A library for working with phylogenetic data.
v0.25.0
vcf_parallel_input_iterator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_VCF_PARALLEL_INPUT_ITERATOR_H_
2 #define GENESIS_POPULATION_FORMATS_VCF_PARALLEL_INPUT_ITERATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2021 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
37 
38 #include <cassert>
39 #include <stdexcept>
40 #include <string>
41 #include <vector>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // VCF/BCF Parallel Input Iterator
48 // =================================================================================================
49 
58 {
59 public:
60 
61  // -------------------------------------------------------------------------
62  // Member Types
63  // -------------------------------------------------------------------------
64 
66 
67  // -------------------------------------------------------------------------
68  // Constructors and Rule of Five
69  // -------------------------------------------------------------------------
70 
74  VcfParallelInputIterator() = default;
75 
80  std::vector<std::string> const& filenames,
81  size_t block_size = 1024
82  ) {
83  if( filenames.empty() ) {
84  throw std::invalid_argument(
85  "Cannot construct a VcfParallelInputIterator with an empty list of files. "
86  "Use the default constructor instead."
87  );
88  }
89 
90  iterators_.reserve( filenames.size() );
91  for( size_t i = 0; i < filenames.size(); ++i ) {
92  iterators_.emplace_back( filenames[i], block_size );
93  }
94 
95  // "Increment" with an offset of zero, that is, move to the first position that
96  // is present in all iterators.
97  increment_( 0 );
98  }
99 
100  ~VcfParallelInputIterator() = default;
101 
102  VcfParallelInputIterator( self_type const& ) = default;
103  VcfParallelInputIterator( self_type&& ) = default;
104 
105  self_type& operator= ( self_type const& ) = default;
106  self_type& operator= ( self_type&& ) = default;
107 
108  // -------------------------------------------------------------------------
109  // Comparators
110  // -------------------------------------------------------------------------
111 
112  explicit operator bool() const
113  {
114  return good();
115  }
116 
117  bool good() const
118  {
119  assert( ! iterators_.empty() );
120  assert(
121  [&](){
122  for( size_t i = 1; i < iterators_.size(); ++i ) {
123  if( iterators_[i].good() != iterators_[0].good() ) {
124  return false;
125  }
126  }
127  return true;
128  }()
129  );
130 
131  return iterators_[0].good();
132  }
133 
134  // -------------------------------------------------------------------------
135  // Accessors
136  // -------------------------------------------------------------------------
137 
138  VcfInputIterator const& operator []( size_t index ) const
139  {
140  assert( index < iterators_.size() );
141  return iterators_[index];
142  }
143 
144  VcfInputIterator& operator []( size_t index )
145  {
146  assert( index < iterators_.size() );
147  return iterators_[index];
148  }
149 
150  size_t size() const
151  {
152  return iterators_.size();
153  }
154 
155  std::vector<VcfInputIterator>::iterator begin()
156  {
157  return iterators_.begin();
158  }
159 
160  std::vector<VcfInputIterator>::const_iterator begin() const
161  {
162  return iterators_.cbegin();
163  }
164 
165  std::vector<VcfInputIterator>::iterator end()
166  {
167  return iterators_.end();
168  }
169 
170  std::vector<VcfInputIterator>::const_iterator end() const
171  {
172  return iterators_.cend();
173  }
174 
175  // -------------------------------------------------------------------------
176  // Iteration
177  // -------------------------------------------------------------------------
178 
180  {
181  increment_( 1 );
182  return *this;
183  }
184 
186  {
187  increment_( 1 );
188  return *this;
189  }
190 
191  std::string get_chromosome() const
192  {
193  assert( ! iterators_.empty() );
194  return iterators_[0]->get_chromosome();
195  }
196 
197  size_t get_position() const
198  {
199  assert( ! iterators_.empty() );
200  return iterators_[0]->get_position();
201  }
202 
203  // -------------------------------------------------------------------------
204  // Private Members
205  // -------------------------------------------------------------------------
206 
207 private:
208 
209  void increment_( size_t offset )
210  {
211  // In the default constructed case, this will fail - on purpose,
212  // because that is an error on the users end.
213  assert( ! iterators_.empty() );
214 
215  // Try a new postion. We use the immediate next as a starting point.
216  auto try_chr = iterators_[0]->get_chromosome();
217  auto try_pos = iterators_[0]->get_position() + offset;
218 
220 
221  LOG_DBG << "go";
222 
223  // Now see if we can get there, or try again as long as needed.
224  bool hot = true;
225  while( hot ) {
226  hot = false;
227 
228  LOG_DBG1 << "hot " << try_chr << " " << try_pos;
229 
230  // Go through all iterators, and see if we can reach the new position.
231  // If not, that is, if one of them overshoots because it does not have that position,
232  // we run another loop and try again.
233  for( size_t i = 0; i < iterators_.size(); ++i ) {
234 
235  LOG_DBG2 << "i " << i;
236 
237  // For the current iterator, advance as long as we are behind our target position.
238  while(
239  iterators_[i].good() &&
240  (
241  iterators_[i]->get_chromosome() < try_chr ||
242  (
243  iterators_[i]->get_chromosome() == try_chr &&
244  iterators_[i]->get_position() < try_pos
245  )
246  )
247  ) {
248  ++iterators_[i];
249 
250  LOG_DBG3 << "a " << iterators_[i]->get_chromosome() << " " << iterators_[i]->get_position();
251 
252  // Need to ensure that the files are sorted properly.
253  // Otherwise, nasty weird behaviour could occur.
254  if(
255  iterators_[i].good() &&
256  iterators_[i]->get_chromosome() < try_chr &&
257  iterators_[i]->get_position() < try_pos
258  ) {
259  LOG_ERR << "try " << try_chr << " " << try_pos;
260  LOG_ERR << "err " << iterators_[i]->get_chromosome() << " " << iterators_[i]->get_position();
261 
262  throw std::runtime_error(
263  "Input VCF/VCF is not sorted by chromosome and position. "
264  "Cannot use VcfParallelInputIterator on such files."
265  );
266  }
267  }
268 
269  // Now check what we got. Different outcomes require different actions.
270 
271  // The iterator is at the end.
272  if( ! iterators_[i].good() ) {
273  LOG_DBG2 << "ng";
274 
275  // We set all others to end as well, and are done here.
276  // This is a bit of a shortcut, as we do not advance them,
277  // but just set them to default constructed iterators, but that should work
278  // fine for most applications. We lose the abolity to read their filenames
279  // and other non-iterator stuff, but that should be okay for now.
280  for( size_t i = 0; i < iterators_.size(); ++i ) {
281  iterators_[i] = VcfInputIterator();
282  }
283 
284  // We could simply return here. Instead, we break out of the per-iterator loop.
285  // Then, the outer loop will also terminate, and we will then run the final
286  // assertion.
287  break;
288  }
289  assert( iterators_[i].good() );
290 
291  // The iterator is on a new chromosome or at a position behind the one we tried for.
292  if(
293  iterators_[i]->get_chromosome() > try_chr ||
294  iterators_[i]->get_position() > try_pos
295  ) {
296  LOG_DBG2 << "o " << iterators_[i]->get_chromosome() << " " << iterators_[i]->get_position();
297 
298  // The current try is not part of that iterator. We use it as our new target,
299  // and move to another iteration of the outer loop.
300  try_chr = iterators_[i]->get_chromosome();
301  try_pos = iterators_[i]->get_position();
302  hot = true;
303  break;
304  }
305 
306  // If we are here, we must have found the position!
307  // Assert this, but then, nothing further needs to be done here.
308  assert( try_chr == iterators_[i]->get_chromosome() );
309  assert( try_pos == iterators_[i]->get_position() );
310  }
311  }
312 
313  LOG_DBG << "done";
314 
315  // Assert that all iterators are the same now - either end, or same chr and pos.
316  assert(
317  [&](){
318  for( size_t i = 1; i < iterators_.size(); ++i ) {
319  if(
320  iterators_[i].good() != iterators_[0].good() ||
321  (
322  iterators_[i].good() && (
323  iterators_[i]->get_chromosome() != iterators_[0]->get_chromosome() ||
324  iterators_[i]->get_position() != iterators_[0]->get_position()
325  ))
326  ) {
327  return false;
328  }
329  }
330  return true;
331  }()
332  );
333  }
334 
335  // -------------------------------------------------------------------------
336  // Data Members
337  // -------------------------------------------------------------------------
338 
339 private:
340 
341  std::vector<VcfInputIterator> iterators_;
342 
343 };
344 
345 } // namespace population
346 } // namespace genesis
347 
348 #endif // htslib guard
349 #endif // include guard
genesis::population::VcfParallelInputIterator::get_position
size_t get_position() const
Definition: vcf_parallel_input_iterator.hpp:197
genesis::population::VcfParallelInputIterator::self_type
VcfParallelInputIterator self_type
Definition: vcf_parallel_input_iterator.hpp:65
genesis::population::VcfParallelInputIterator::~VcfParallelInputIterator
~VcfParallelInputIterator()=default
genesis::population::VcfParallelInputIterator::get_chromosome
std::string get_chromosome() const
Definition: vcf_parallel_input_iterator.hpp:191
genesis::population::VcfParallelInputIterator
Iterate a set of input VCF/BCF file in parallel, that is, synchronize the iteration to be at the same...
Definition: vcf_parallel_input_iterator.hpp:57
genesis::utils::Logging::kInfo
@ kInfo
Infos, for example when a file was written. See LOG_INFO.
Definition: logging.hpp:416
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
genesis::population::VcfParallelInputIterator::VcfParallelInputIterator
VcfParallelInputIterator(std::vector< std::string > const &filenames, size_t block_size=1024)
Create an instance that reads from a set of input file names.
Definition: vcf_parallel_input_iterator.hpp:79
LOG_DBG2
#define LOG_DBG2
Log a debug message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:124
LOG_ERR
#define LOG_ERR
Log an error. See genesis::utils::LoggingLevel.
Definition: logging.hpp:94
genesis::population::VcfParallelInputIterator::good
bool good() const
Definition: vcf_parallel_input_iterator.hpp:117
genesis::population::VcfParallelInputIterator::begin
std::vector< VcfInputIterator >::iterator begin()
Definition: vcf_parallel_input_iterator.hpp:155
genesis::population::VcfParallelInputIterator::size
size_t size() const
Definition: vcf_parallel_input_iterator.hpp:150
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VcfParallelInputIterator::end
std::vector< VcfInputIterator >::const_iterator end() const
Definition: vcf_parallel_input_iterator.hpp:170
LOG_DBG1
#define LOG_DBG1
Log a debug message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:121
LOG_SCOPE_LEVEL
#define LOG_SCOPE_LEVEL(level)
This macro sets the logging level to a certain value for this scope and sets it back to the original ...
Definition: logging.hpp:165
genesis::population::VcfParallelInputIterator::operator[]
VcfInputIterator const & operator[](size_t index) const
Definition: vcf_parallel_input_iterator.hpp:138
vcf_input_iterator.hpp
genesis::population::VcfParallelInputIterator::operator=
self_type & operator=(self_type const &)=default
genesis::population::VcfParallelInputIterator::begin
std::vector< VcfInputIterator >::const_iterator begin() const
Definition: vcf_parallel_input_iterator.hpp:160
LOG_DBG3
#define LOG_DBG3
Log a debug message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:127
genesis::population::VcfParallelInputIterator::operator++
self_type & operator++()
Definition: vcf_parallel_input_iterator.hpp:179
LOG_DBG
#define LOG_DBG
Log a debug message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:118
genesis::population::VcfParallelInputIterator::VcfParallelInputIterator
VcfParallelInputIterator()=default
Create a default instance, with no input. This is also the past-the-end iterator.
genesis::population::VcfInputIterator
Iterate an input source and parse it as a VCF/BCF file.
Definition: vcf_input_iterator.hpp:91
genesis::population::VcfParallelInputIterator::end
std::vector< VcfInputIterator >::iterator end()
Definition: vcf_parallel_input_iterator.hpp:165