A library for working with phylogenetic data.
v0.25.0
variant.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 extern "C" {
37  #include <htslib/hts.h>
38  #include <htslib/vcf.h>
39 }
40 
41 #include <array>
42 #include <cassert>
43 #include <cstring>
44 #include <iostream>
45 #include <stdexcept>
46 
47 namespace genesis {
48 namespace population {
49 
50 // =================================================================================================
51 // Helper Functions
52 // =================================================================================================
53 
55 {
56  return merge( variant.samples );
57 }
58 
59 std::array<std::pair<char, size_t>, 4> sorted_variant_counts(
60  Variant const& variant, bool reference_first
61 ) {
62  // We use sorting networks for speed here. See f_st_asymptotically_unbiased_a1n1a2n2()
63  // for details on the technique.
64 
65  auto const total = total_base_counts( variant );
66  if( reference_first ) {
67  std::array<std::pair<char, size_t>, 4> result;
68  switch( variant.reference_base ) {
69  case 'a':
70  case 'A': {
71  result[0] = { 'A', total.a_count };
72  result[1] = { 'C', total.c_count };
73  result[2] = { 'G', total.g_count };
74  result[3] = { 'T', total.t_count };
75  break;
76  }
77  case 'c':
78  case 'C': {
79  result[0] = { 'C', total.c_count };
80  result[1] = { 'A', total.a_count };
81  result[2] = { 'G', total.g_count };
82  result[3] = { 'T', total.t_count };
83  break;
84  }
85  case 'g':
86  case 'G': {
87  result[0] = { 'G', total.g_count };
88  result[1] = { 'A', total.a_count };
89  result[2] = { 'C', total.c_count };
90  result[3] = { 'T', total.t_count };
91  break;
92  }
93  case 't':
94  case 'T': {
95  result[0] = { 'T', total.t_count };
96  result[1] = { 'A', total.a_count };
97  result[2] = { 'C', total.c_count };
98  result[3] = { 'G', total.g_count };
99  break;
100  }
101  default: {
102  throw std::runtime_error(
103  "Invalid reference base character " + utils::char_to_hex( variant.reference_base )
104  );
105  }
106  }
107  if( result[1].second < result[2].second ) {
108  std::swap( result[1], result[2] );
109  }
110  if( result[1].second < result[3].second ) {
111  std::swap( result[1], result[3] );
112  }
113  if( result[2].second < result[3].second ) {
114  std::swap( result[2], result[3] );
115  }
116  return result;
117  } else {
118  std::array<std::pair<char, size_t>, 4> result = {
119  std::pair<char, size_t>{ 'A', total.a_count },
120  std::pair<char, size_t>{ 'C', total.c_count },
121  std::pair<char, size_t>{ 'G', total.g_count },
122  std::pair<char, size_t>{ 'T', total.t_count }
123  };
124  if( result[0].second < result[1].second ) {
125  std::swap( result[0], result[1] );
126  }
127  if( result[2].second < result[3].second ) {
128  std::swap( result[2], result[3] );
129  }
130  if( result[0].second < result[2].second ) {
131  std::swap( result[0], result[2] );
132  }
133  if( result[1].second < result[3].second ) {
134  std::swap( result[1], result[3] );
135  }
136  if( result[1].second < result[2].second ) {
137  std::swap( result[1], result[2] );
138  }
139  return result;
140  }
141 }
142 
143 // =================================================================================================
144 // Conversion Functions
145 // =================================================================================================
146 
147 std::ostream& to_sync( Variant const& var, std::ostream& os )
148 {
149  os << var.chromosome << "\t" << var.position << "\t" << var.reference_base;
150  for( auto const& bs : var.samples ) {
151  os << "\t";
152  to_sync( bs, os );
153  }
154  os << "\n";
155  return os;
156 }
157 
159  SimplePileupReader::Record const& record,
160  unsigned char min_phred_score
161 ) {
162  // Set basic data
163  Variant result;
164  result.chromosome = record.chromosome;
165  result.position = record.position;
166  result.reference_base = utils::to_upper( record.reference_base );
167 
168  // Convert the individual samples
169  result.samples.reserve( record.samples.size() );
170  for( size_t i = 0; i < record.samples.size(); ++i ) {
171  result.samples.push_back( convert_to_base_counts( record.samples[i], min_phred_score ));
172  }
173 
174  // Pileup does not contain ALT bases, so infer them from counts,
175  // using the base with the most counts that is not the reference base.
176  // We only do this if we have a reference base though, as otherwise, the sorting and alternative
177  // is meaningless anyway. Only need to check upper case here, as we converted above.
178  if(
179  result.reference_base == 'A' ||
180  result.reference_base == 'C' ||
181  result.reference_base == 'G' ||
182  result.reference_base == 'T'
183  ) {
184  auto const sorted = sorted_variant_counts( result, true );
185  result.alternative_base = utils::to_upper( sorted[1].first );
186  }
187 
188  return result;
189 }
190 
191 #ifdef GENESIS_HTSLIB
192 
194 {
195  // Error check.
196  if( ! record.has_format( "AD" )) {
197  throw std::runtime_error(
198  "Cannot convert VcfRecord to Variant, as the VcfRecord does not have "
199  "the required FORMAT field 'AD'"
200  );
201  }
202 
203  // Get all variants (REF and ALT), and check them. We manually add deletion here if ALT == ".",
204  // as this is not part of the variants provided from htslib.
205  // There are only 6 possible single nucleotide variants (ACGTN.), so we save the effort of
206  // creating an intermediate vector, and use a fixed size array and a counter instead.
207  // Also, we use htslib functions directly on the vcf record to not have to go through
208  // string allocations for each alternative.
209  record.unpack();
210  auto rec_data = record.data();
211 
212  // The n_allele count does not include deletions ('.'), meaning that if there is inly a single
213  // variant, we manually adjust this to also include the deletion.
214  // To avoid too much branching, we init the array so that we have all deletions initially,
215  // and hence do not need to overwrite in the case that we added that deletion manually
216  // to the counter.
217  size_t const var_cnt = rec_data->n_allele == 1 ? rec_data->n_allele + 1 : rec_data->n_allele;
218  auto vars = std::array<char, 6>{ '.', '.', '.', '.', '.', '.' };
219  if( var_cnt > 6 ) {
220  throw std::runtime_error(
221  "Invalid VCF Record that contains a REF or ALT sequence/allele with "
222  "invalid nucleitides where only `[ACGTN.]` are allowed."
223  );
224  }
225 
226  // Now store all single nucleotide alleles that are in the record
227  // (we only count up to the actual number that is there, so that the missing deletion [in case
228  // that this record has a deletion] is not touched).
229  for( size_t i = 0; i < rec_data->n_allele; ++i ) {
230  if( std::strlen( rec_data->d.allele[i] ) != 1 ) {
231  throw std::runtime_error(
232  "Cannot convert VcfRecord to Variant, as one of the VcfRecord REF or ALT "
233  "sequences/alleles is not a single nucleotide."
234  );
235  }
236  vars[i] = *rec_data->d.allele[i];
237  }
238 
239  // Prepare common fields of the result.
240  // For the reference base, we use the first nucleotide of the first variant (REF);
241  // above, we have ensured that this exists and is in fact a single nucleotide only.
242  // Same for the alternative base, where we use the first ALT in the record.
243  Variant result;
244  result.chromosome = record.get_chromosome();
245  result.position = record.get_position();
246  result.reference_base = vars[0];
247  result.alternative_base = vars[1]; // TODO this is only reasonable for biallelic SNPs
248 
249  // Process the samples that are present in the VCF record line.
250  result.samples.reserve( record.header().get_sample_count() );
251  for( auto const& sample_ad : record.get_format_int("AD") ) {
252  if( sample_ad.valid_value_count() > 0 && sample_ad.valid_value_count() != var_cnt ) {
253  throw std::runtime_error(
254  "Invalid VCF Record that contains " + std::to_string( var_cnt ) +
255  " REF and ALT sequences/alleles, but its FORMAT field 'AD' only contains " +
256  std::to_string( sample_ad.valid_value_count() ) + " entries"
257  );
258  }
259 
260  // Go through all REF and ALT entries and their respective FORMAT 'AD' counts,
261  // and sum them up, storing them in a new BaseCounts instance at the end of the vector.
262  result.samples.emplace_back();
263  auto& sample = result.samples.back();
264  for( size_t i = 0; i < sample_ad.valid_value_count(); ++i ) {
265 
266  // Get the nucleitide and its count.
267  auto const cnt = sample_ad.get_value_at(i);
268  if( cnt < 0 ) {
269  throw std::runtime_error(
270  "Invalid VCF Record with FORMAT field 'AD' value < 0 for a sample"
271  );
272  }
273 
274  // Add it to the respective count variable of the sample.
275  switch( vars[i] ) {
276  case 'a':
277  case 'A': {
278  sample.a_count = cnt;
279  break;
280  }
281  case 'c':
282  case 'C': {
283  sample.c_count = cnt;
284  break;
285  }
286  case 'g':
287  case 'G': {
288  sample.g_count = cnt;
289  break;
290  }
291  case 't':
292  case 'T': {
293  sample.t_count = cnt;
294  break;
295  }
296  case 'n':
297  case 'N': {
298  sample.n_count = cnt;
299  break;
300  }
301  case '.': {
302  sample.d_count = cnt;
303  break;
304  }
305  default: {
306  throw std::runtime_error(
307  "Invalid VCF Record that contains a REF or ALT sequence/allele with "
308  "invalid nucleitide `" + std::string( 1, vars[i] ) + "` where only `[ACGTN.]` "
309  "are allowed."
310  );
311  }
312  }
313  }
314  }
315 
316  // Last proof check.
317  if( result.samples.size() != record.header().get_sample_count() ) {
318  throw std::runtime_error(
319  "Invalid VCF Record with number of samples in the record (" +
320  std::to_string( result.samples.size() ) + ") not equal to the number of samples given "
321  "in the VCF header (" + std::to_string( record.header().get_sample_count() ) + ")"
322  );
323  }
324 
325  return result;
326 }
327 
328 #endif // htslib guard
329 
330 } // namespace population
331 } // namespace genesis
genesis::placement::swap
void swap(Sample &lhs, Sample &rhs)
Definition: sample.cpp:104
variant.hpp
genesis::population::VcfRecord::header
VcfHeader & header()
Return the VcfHeader instance associated with this record.
Definition: vcf_record.hpp:213
genesis::population::sorted_variant_counts
std::array< std::pair< char, size_t >, 4 > sorted_variant_counts(Variant const &variant, bool reference_first)
Definition: variant.cpp:59
genesis::population::convert_to_variant
Variant convert_to_variant(SimplePileupReader::Record const &record, unsigned char min_phred_score)
Definition: variant.cpp:158
genesis::population::SimplePileupReader::Record::position
size_t position
Definition: simple_pileup_reader.hpp:144
genesis::population::Variant::position
size_t position
Definition: variant.hpp:65
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:66
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:227
genesis::population::VcfRecord::data
::bcf1_t * data()
Return the internal htslib bcf1_t record data struct pointer.
Definition: vcf_record.hpp:194
genesis::population::VcfHeader::get_sample_count
size_t get_sample_count() const
Get the number of samples (columns) in the file.
Definition: vcf_header.cpp:344
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:141
genesis::population::SimplePileupReader::Record::reference_base
char reference_base
Definition: simple_pileup_reader.hpp:145
genesis::population::convert_to_base_counts
BaseCounts convert_to_base_counts(SimplePileupReader::Sample const &sample, unsigned char min_phred_score)
Definition: base_counts.cpp:245
genesis::population::Variant::samples
std::vector< BaseCounts > samples
Definition: variant.hpp:69
genesis::population::merge
BaseCounts merge(BaseCounts const &p1, BaseCounts const &p2)
Merge the counts of two BaseCountss.
Definition: base_counts.cpp:153
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis::population::VcfRecord::get_chromosome
std::string get_chromosome() const
Get the name of a chromosome/contig/sequence (CHROM, first column of the line).
Definition: vcf_record.cpp:170
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
base_counts.hpp
genesis::population::VcfRecord::has_format
bool has_format(std::string const &id) const
Return whether the record has a given FORMAT id present.
Definition: vcf_record.cpp:487
genesis::population::total_base_counts
BaseCounts total_base_counts(Variant const &variant)
Definition: variant.cpp:54
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:53
char.hpp
genesis::population::VcfRecord::get_position
size_t get_position() const
Get the position within the chromosome/contig (POS, second column of the line).
Definition: vcf_record.cpp:175
genesis::population::to_sync
std::ostream & to_sync(BaseCounts const &bs, std::ostream &os)
Output a BaseCounts instance to a stream in the PoPoolation2 sync format.
Definition: base_counts.cpp:352
genesis::population::SimplePileupReader::Record::chromosome
std::string chromosome
Definition: simple_pileup_reader.hpp:143
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:67
genesis::population::SimplePileupReader::Record::samples
std::vector< Sample > samples
Definition: simple_pileup_reader.hpp:146
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:67
genesis::population::VcfRecord::unpack
void unpack() const
Unpack the htslib bcf1_t record data.
Definition: vcf_record.cpp:165
genesis::population::VcfRecord::get_format_int
genesis::utils::Range< VcfFormatIteratorInt > get_format_int(std::string const &id) const
Get an iterator pair over the samples that accesses a certain FORMAT id as an int value.
Definition: vcf_record.cpp:563
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:64