A library for working with phylogenetic and population genetic data.
v0.32.0
af_spectrum.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
31 #ifdef GENESIS_HTSLIB
32 
33 #include <algorithm>
34 #include <cassert>
35 #include <cmath>
36 #include <functional>
37 #include <limits>
38 #include <stdexcept>
39 
42 
44 
45 namespace genesis {
46 namespace population {
47 
48 // =================================================================================================
49 // Constructors and Rule of Five
50 // =================================================================================================
51 
52 AlleleFrequencyWindow::AlleleFrequencyWindow( size_t width, size_t number_of_bins )
53  : number_of_bins_( number_of_bins )
54  , window_generator_( SlidingWindowType::kInterval, width )
55 {
56  // Set the plugin functions.
57  window_generator_.add_chromosome_start_plugin( [&]( std::string const& chromosome, AFWindow::Accumulator& accu ){
58  on_chromosome_start_( chromosome, accu );
59  });
60  window_generator_.add_chromosome_finish_plugin( [&]( std::string const& chromosome, AFWindow::Accumulator& accu ){
61  on_chromosome_finish_( chromosome, accu );
62  });
63  window_generator_.add_emission_plugin( [&](
64  AFWindow const& window
65  ){
66  on_emission_( window );
67  });
68 }
69 
71 {
72  window_generator_.finish_chromosome();
73 }
74 
75 // =================================================================================================
76 // Window Processing
77 // =================================================================================================
78 
79 void AlleleFrequencyWindow::run_vcf( std::string const& vcf_file )
80 {
81  for( auto record = VcfInputStream( vcf_file ); record; ++record ) {
82  enqueue( *record );
83  }
84 }
85 
87  std::string const& chromosome, size_t position, double frequency
88 ) {
89  if( ! std::isfinite(frequency) || frequency < 0.0 || frequency > 1.0 ) {
90  throw std::invalid_argument(
91  "Invalid allele frequency " + std::to_string(frequency) + " at " +
92  chromosome + ":" + std::to_string( position )
93  );
94  }
95  window_generator_.enqueue( chromosome, position, frequency );
96 }
97 
99 {
100  // Check that the record is one that we can use, and either skip or throw if not.
101  if( ! record.is_snp() || record.get_alternatives_count() != 1 || ! record.has_format( "AD" ) ) {
102  if( skip_invalid_records_ ) {
103  return;
104  } else {
105  throw std::runtime_error(
106  "Invalid VCF Record for Allele Frequency Window that is either not a biallelic SNP "
107  "or does not have the FORMAT field `AD`."
108  );
109  }
110  }
111 
112  // Sum up all allelic depth values for all samples of the record line.
113  size_t ref = 0;
114  size_t alt = 0;
115  for( auto const& ad_field : record.get_format_int("AD") ) {
116  if( ad_field.values_per_sample() != 2 ) {
117  throw std::runtime_error(
118  "Invalid VCF Record that claims to be biallelic, but in fact contains more than "
119  "two values for the FORMAT field `AD` for a sample."
120  );
121  }
122 
123  ref += ad_field.get_value_at(0);
124  alt += ad_field.get_value_at(1);
125  }
126 
127  // Compute the allele frequency based on the counts, and store them in the window.
128  // Here, we simply skip invalid values, which can happen if all AD fields are zero.
129  // Rare, so we don't want to fail here. Simply ignore them.
130  double freq = static_cast<double>(alt) / static_cast<double>( alt + ref );
131  if( ! std::isfinite(freq) ) {
132  return;
133  }
134  window_generator_.enqueue( record.get_chromosome(), record.get_position(), freq );
135 
136  // The AF field just counts frequencies based on the called genotype data (e.g. `0|1`),
137  // but does not take the actual frequencies / allelic depths into account.
138  // Hence, we only keep the following code for reference, in case that this information
139  // is needed at some point.
140  // auto const af = record.get_info_float("AF");
141  // if( af.size() != 1 ) {
142  // throw std::runtime_error(
143  // "Invalid allele frequency (`AF`) field in VCF record at " + record.at() + " with size" +
144  // std::to_string( af.size() ) + " instead of expected size 1."
145  // );
146  // }
147  // window_generator_.enqueue( record.get_chromosome(), record.get_position(), af[0] );
148 }
149 
150 // =================================================================================================
151 // Internal Members
152 // =================================================================================================
153 
154 void AlleleFrequencyWindow::on_chromosome_start_(
155  std::string const& chromosome,
156  AFWindow::Accumulator&
157 ) {
158  spectra_.emplace_back( chromosome );
159  assert( ! spectra_.empty() );
160  assert( spectra_.back().chromosome == chromosome );
161 
162  // Not sure how that function might be handy, but let's offer it anyways.
163  if( on_chromosome_start ) {
164  on_chromosome_start( spectra_.back() );
165  }
166 }
167 
168 void AlleleFrequencyWindow::on_chromosome_finish_(
169  std::string const& chromosome,
170  AFWindow::Accumulator&
171 ) {
172  assert( ! spectra_.empty() );
173  assert( spectra_.back().chromosome == chromosome );
174  (void) chromosome;
175 
176  // Make bitmap or whatever the user wants.
177  if( on_chromosome_finish ) {
178  on_chromosome_finish( spectra_.back() );
179  }
180 }
181 
182 void AlleleFrequencyWindow::on_emission_(
183  AFWindow const& window
184 ) {
185  assert( ! spectra_.empty() );
186  auto& values = spectra_.back().values;
187  values.emplace_back( number_of_bins_, 0 );
188  auto& bins = values.back();
189 
190  // Collect all data from the window and fill the frequency bins.
191  for( auto const& entry : window ) {
192  if( ! std::isfinite(entry.data) || entry.data < 0.0 || entry.data > 1.0 ) {
193  throw std::invalid_argument(
194  "Invalid allele frequency " + std::to_string(entry.data) + " at " +
195  spectra_.back().chromosome + ":" + std::to_string( entry.position )
196  );
197  }
198 
199  // Bring the value into the bins and store it. We need a special case for the exact value
200  // of 1.0 here, so that we don't get an overflow of the bin index.
201  size_t index = std::floor( entry.data * static_cast<double>( number_of_bins_ ));
202  index = std::min( index, number_of_bins_ - 1 );
203  ++bins[ index ];
204  }
205 }
206 
207 } // namespace population
208 } // namespace genesis
209 
210 #endif // htslib guard
genesis::population::VcfRecord::is_snp
bool is_snp() const
Return whether this variant is a SNP.
Definition: vcf_record.cpp:304
genesis::population::AlleleFrequencyWindow::AlleleFrequencyWindow
AlleleFrequencyWindow(size_t width, size_t number_of_bins=100)
Definition: af_spectrum.cpp:52
genesis::population::SlidingWindowGenerator::add_emission_plugin
self_type & add_emission_plugin(on_emission const &plugin)
Add an on_emission plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:482
genesis::population::AlleleFrequencyWindow::~AlleleFrequencyWindow
~AlleleFrequencyWindow()
Definition: af_spectrum.cpp:70
genesis::population::Window
Window over the chromosomes of a genome.
Definition: window.hpp:105
genesis::population::AlleleFrequencyWindow::on_chromosome_start
std::function< void(Spectrum const &)> on_chromosome_start
Definition: af_spectrum.hpp:85
genesis::population::SlidingWindowGenerator::finish_chromosome
void finish_chromosome(size_t last_position=0)
Explicitly finish a chromosome, and emit all remaining Windows.
Definition: sliding_window_generator.hpp:611
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
af_spectrum.hpp
genesis::population::SlidingWindowType
SlidingWindowType
SlidingWindowType of a Window, that is, whether we slide along a fixed size interval of the genome,...
Definition: sliding_window_generator.hpp:55
genesis::population::VcfRecord::get_chromosome
std::string get_chromosome() const
Get the name of a chromosome/contig/sequence (CHROM, first column of the line).
Definition: vcf_record.cpp:170
genesis::population::Window::Accumulator
A Accumulator
Definition: window.hpp:114
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::AlleleFrequencyWindow::enqueue
void enqueue(std::string const &chromosome, size_t position, double frequency)
Definition: af_spectrum.cpp:86
genesis::population::VcfRecord::has_format
bool has_format(std::string const &id) const
Return whether the record has a given FORMAT id present.
Definition: vcf_record.cpp:522
genesis::population::SlidingWindowGenerator::add_chromosome_start_plugin
self_type & add_chromosome_start_plugin(on_chromosome_start const &plugin)
Add an on_chromosome_start plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:446
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data const &data)
Enqueue a new Data value.
Definition: sliding_window_generator.hpp:534
genesis::population::VcfRecord::get_position
size_t get_position() const
Get the position within the chromosome/contig (POS, second column of the line).
Definition: vcf_record.cpp:181
genesis::population::SlidingWindowType::kInterval
@ kInterval
Windows of this type are defined by a fixed start and end position on a chromosome.
genesis::population::AlleleFrequencyWindow::on_chromosome_finish
std::function< void(Spectrum const &)> on_chromosome_finish
Definition: af_spectrum.hpp:86
writer.hpp
genesis::population::SlidingWindowGenerator::add_chromosome_finish_plugin
self_type & add_chromosome_finish_plugin(on_chromosome_finish const &plugin)
Add an on_chromosome_finish plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:455
genesis::population::VcfInputStream
Iterate an input source and parse it as a VCF/BCF file.
Definition: vcf_input_stream.hpp:76
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::population::VcfRecord::get_alternatives_count
size_t get_alternatives_count() const
Get the number of alternative alleles/sequences of the variant (ALT, fifth column of the line).
Definition: vcf_record.cpp:239
genesis::population::VcfRecord::get_format_int
genesis::utils::Range< VcfFormatIteratorInt > get_format_int(std::string const &id) const
Get an iterator pair over the samples that accesses a certain FORMAT id as an int value.
Definition: vcf_record.cpp:598
vcf_input_stream.hpp
genesis::population::AlleleFrequencyWindow::run_vcf
void run_vcf(std::string const &vcf_file)
Definition: af_spectrum.cpp:79