A library for working with phylogenetic and population genetic data.
v0.32.0
vcf_window.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_WINDOW_VCF_WINDOW_H_
2 #define GENESIS_POPULATION_WINDOW_VCF_WINDOW_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
36 #include <cassert>
37 #include <functional>
38 #include <string>
39 #include <vector>
40 
45 
46 namespace genesis {
47 namespace population {
48 
49 // =================================================================================================
50 // VCF Window Generator
51 // =================================================================================================
52 
72 template<class Data, class Accumulator = EmptyAccumulator>
75  std::string const& vcf_file,
76  std::function<Data( VcfRecord const& )> conversion,
77  std::function<bool( VcfRecord const& )> condition = {}
78 ) {
79  size_t current_chr_len = 0;
80  for( auto record = VcfInputStream( vcf_file ); record; ++record ) {
81 
82  // Check if we want to process this record at all.
83  if( condition && ! condition( *record )) {
84  continue;
85  }
86 
87  // Get the chromosome that this record belongs to.
88  auto const rec_chromosome = record->get_chromosome();
89 
90  // We need to take care of changes of the chromosome, that is, either at the very
91  // beginning of the VCF file, or at actual changes within the file.
92  // When finishing a chromosome, we want to use its proper length from the VCF header.
93  if( rec_chromosome != generator.chromosome() ) {
94 
95  // We store the length, and also make sure that we get the same value via two
96  // different methods. Safe is safe.
97  current_chr_len = record->header().get_chromosome_length( rec_chromosome );
98  assert(
99  record->header().get_chromosome_values( rec_chromosome ).count("length") == 0 ||
100  std::stoul(
101  record->header().get_chromosome_values( rec_chromosome ).at("length")
102  ) == current_chr_len
103  );
104 
105  // If there has been data enqueued before, this is not the first data point of the
106  // chromosome. In that case, we need to finish the previous chromosome first,
107  // and do so using its length to get the full interval covered and closed.
108  if( ! generator.empty() ) {
109 
110  // We might not have usable chromosome length information in the header.
111  // In that case, simply finish the current chromosome where it is.
112  // (As of now, finish_chromosome uses 0 as default value anyway, so we could just
113  // call it directly without the extra condition here. But this way seems a bit
114  // more future proof, if we need to change this and use overloads of
115  // finish_chromosome instead.)
116  if( current_chr_len == 0 ) {
117  generator.finish_chromosome();
118  } else {
119  generator.finish_chromosome( current_chr_len );
120  }
121  }
122 
123  } else {
124  // If we did not change chromosomes, we expect their length not to change.
125  assert( record->header().get_chromosome_length( rec_chromosome ) == current_chr_len );
126  }
127 
128  // Finally, enqueue the new data.
129  generator.enqueue( rec_chromosome, record->get_position(), conversion( *record ));
130  }
131 
132  // Now that we are done with the whole file, we also need to finish and close the
133  // last remaining chromosome interval properly. Same as above.
134  if( current_chr_len == 0 ) {
135  generator.finish_chromosome();
136  } else {
137  generator.finish_chromosome( current_chr_len );
138  }
139 }
140 
141 } // namespace population
142 } // namespace genesis
143 
144 #endif // htslib guard
145 #endif // include guard
genesis::population::SlidingWindowGenerator::chromosome
std::string const & chromosome() const
Get the chromosome name that we are currently processing.
Definition: sliding_window_generator.hpp:408
genesis::population::SlidingWindowGenerator::finish_chromosome
void finish_chromosome(size_t last_position=0)
Explicitly finish a chromosome, and emit all remaining Windows.
Definition: sliding_window_generator.hpp:611
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SlidingWindowGenerator::empty
bool empty() const
Return whether the instance is empty.
Definition: sliding_window_generator.hpp:421
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data const &data)
Enqueue a new Data value.
Definition: sliding_window_generator.hpp:534
window.hpp
genesis::population::run_vcf_window
void run_vcf_window(SlidingWindowGenerator< Data, Accumulator > &generator, std::string const &vcf_file, std::function< Data(VcfRecord const &)> conversion, std::function< bool(VcfRecord const &)> condition={})
Convenience function to iterate over a whole VCF file.
Definition: vcf_window.hpp:73
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
vcf_record.hpp
vcf_input_stream.hpp
sliding_window_generator.hpp
genesis::population::SlidingWindowGenerator
Generator for sliding Windows over the chromosomes of a genome.
Definition: sliding_window_generator.hpp:173