A library for working with phylogenetic and population genetic data.
v0.32.0
fastx_input_view_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_FORMATS_FASTX_INPUT_VIEW_STREAM_H_
2 #define GENESIS_SEQUENCE_FORMATS_FASTX_INPUT_VIEW_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
34 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
35 
41 
42 #include <array>
43 #include <cassert>
44 #include <cstddef>
45 #include <iterator>
46 #include <memory>
47 #include <sstream>
48 #include <stdexcept>
49 #include <string_view>
50 
51 namespace genesis {
52 namespace sequence {
53 
54 // =================================================================================================
55 // Fasta and Fastq Input Stream
56 // =================================================================================================
57 
58 // Simple aliases for more expressive code.
// Both aliases name the very same class: the format is not chosen by the caller, but detected
// from the first character of the input ('>' or '@') when an iteration is started.
// The forward declaration is needed because the aliases appear before the class definition.
59 class FastxInputViewStream;
60 using FastaInputViewStream = FastxInputViewStream;
61 using FastqInputViewStream = FastxInputViewStream;
62 
101 class FastxInputViewStream
102 {
103 public:
104 
105  // -------------------------------------------------------------------------
106  // Member Types
107  // -------------------------------------------------------------------------
108 
109  using self_type = FastxInputViewStream;
110  using value_type = Sequence;
111  using pointer = value_type*;
112  using reference = value_type&;
113  using difference_type = std::ptrdiff_t;
114  using iterator_category = std::input_iterator_tag;
115 
116  // ======================================================================================
117  // Internal Iterator
118  // ======================================================================================
119 
120 public:
121 
128  class Iterator
129  {
130  // -------------------------------------------------------------------------
131  // Typedefs and Enums
132  // -------------------------------------------------------------------------
133 
134  public:
135 
136  using self_type = FastxInputViewStream::Iterator;
137  using value_type = std::array<std::string_view, 4>;
138  using pointer = value_type const*;
139  using reference = value_type const&;
140  using difference_type = std::ptrdiff_t;
141  using iterator_category = std::input_iterator_tag;
142 
143  // -------------------------------------------------------------------------
144  // Constructors and Rule of Five
145  // -------------------------------------------------------------------------
146 
147  private:
148 
149  Iterator() = default;
150 
151  Iterator( FastxInputViewStream const* parent )
152  : parent_( parent )
153  {
154  // Safeguard
155  if( ! parent_ ) {
156  return;
157  }
158 
159  // Start reading from the input source into a stream.
160  input_stream_ = std::make_shared<utils::InputStream>( parent_->input_source_ );
161 
162  // Check whether the input stream is good (not end-of-stream) and can be read from.
163  // If not, we reached its end, so we stop immediately.
164  if( ! input_stream_ || ! *input_stream_ ) {
165  parent_ = nullptr;
166  input_stream_ = nullptr;
167  sequence_view_ = std::array<std::string_view, 4>();
168  return;
169  }
170 
171  // Check the format. We then stick with it of the rest of the streaming.
172  if( **input_stream_ == '>' ) {
173  format_is_fasta_ = true;
174  } else if( **input_stream_ == '@' ) {
175  format_is_fasta_ = false;
176  } else {
177  throw std::runtime_error(
178  "Malformed fasta/fastq " + input_stream_->source_name() +
179  ", starting with neither '>' nor '@', but instead " +
180  utils::char_to_hex( **input_stream_ )
181  );
182  }
183 
184  // Start streaming the data
185  increment_();
186  }
187 
188  public:
189 
190  ~Iterator() = default;
191 
192  Iterator( self_type const& ) = default;
193  Iterator( self_type&& ) = default;
194 
195  Iterator& operator= ( self_type const& ) = default;
196  Iterator& operator= ( self_type&& ) = default;
197 
198  friend FastxInputViewStream;
199 
200  // -------------------------------------------------------------------------
201  // Iterator Accessors
202  // -------------------------------------------------------------------------
203 
204  self_type const* operator->() const
205  {
206  return this;
207  }
208 
209  self_type* operator->()
210  {
211  return this;
212  }
213 
214  self_type const& operator*() const
215  {
216  return *this;
217  }
218 
219  self_type& operator*()
220  {
221  return *this;
222  }
223 
224  // -------------------------------------------------------------------------
225  // Iteration
226  // -------------------------------------------------------------------------
227 
228  self_type& operator ++ ()
229  {
230  increment_();
231  return *this;
232  }
233 
243  bool operator==( self_type const& it ) const
244  {
245  return parent_ == it.parent_;
246  }
247 
248  bool operator!=( self_type const& it ) const
249  {
250  return !(*this == it);
251  }
252 
253  // -------------------------------------------------------------------------
254  // Sequence Access
255  // -------------------------------------------------------------------------
256 
262  std::string_view const& label() const
263  {
264  return sequence_view_[0];
265  }
266 
273  std::string_view const& label1() const
274  {
275  return sequence_view_[0];
276  }
277 
284  std::string_view const& sites() const
285  {
286  return sequence_view_[1];
287  }
288 
295  std::string_view const& label2() const
296  {
297  return sequence_view_[2];
298  }
299 
307  std::string_view const& quality() const
308  {
309  return sequence_view_[3];
310  }
311 
312  // -------------------------------------------------------------------------
313  // Internal Members
314  // -------------------------------------------------------------------------
315 
316  private:
317 
318  // ---------------------------------------------
319  // Increment and Processing Samples
320  // ---------------------------------------------
321 
322  void increment_()
323  {
324  if( format_is_fasta_ ) {
325  increment_fasta_();
326  } else {
327  increment_fastq_();
328  }
329  }
330 
331  void increment_fasta_()
332  {
333  assert( parent_ );
334 
335  // Check whether the input stream is good (not end-of-stream) and can be read from.
336  // If not, we reached its end, so we stop reading in the next iteration.
337  if( ! input_stream_ || ! *input_stream_ ) {
338  parent_ = nullptr;
339  input_stream_ = nullptr;
340  sequence_view_ = std::array<std::string_view, 4>();
341  return;
342  }
343 
344  // Get the next record. Also give a more user friendly error if this does not work.
345  try {
346  // We do a transfer here to the shared buffer for fastq and fasta.
347  // We could use two buffers instead to avoid this, but that would introduce a
348  // switch in the getters, which is also not nice.
349  auto seqs = input_stream_->get_line_views<2>();
350  sequence_view_[0] = seqs[0];
351  sequence_view_[1] = seqs[1];
352  } catch( std::exception const& ex ) {
353  throw std::runtime_error(
354  "Cannot stream through fasta " + input_stream_->source_name() +
355  " with fast string view parser, either because the file is corrupt, "
356  "or has lines that are too long. Error: " + ex.what()
357  );
358  }
359 
360  // Parse label
361  if( sequence_view_[0].size() < 1 || sequence_view_[0][0] != '>' ) {
362  throw std::runtime_error(
363  "Malformed fasta " + input_stream_->source_name() + ": Expecting '>' at "
364  "beginning of label near line " + std::to_string( input_stream_->line() ) +
365  ". Note that we here can only process fasta with single lines for the " +
366  "sequence and quality data."
367  );
368  }
369  sequence_view_[0].remove_prefix( 1 );
370 
371  // Basic check of sequence length.
372  if( sequence_view_[1].empty() ) {
373  throw std::runtime_error(
374  "Malformed fasta " + input_stream_->source_name() + ": Expecting a " +
375  "sequence sites line after the first label line near line "
376  + std::to_string( input_stream_->line() ) +
377  ". Note that we here can only process fasta with single lines for the " +
378  "sequence and quality data."
379  );
380  }
381  }
382 
383  void increment_fastq_()
384  {
385  assert( parent_ );
386 
387  // Check whether the input stream is good (not end-of-stream) and can be read from.
388  // If not, we reached its end, so we stop reading in the next iteration.
389  if( ! input_stream_ || ! *input_stream_ ) {
390  parent_ = nullptr;
391  input_stream_ = nullptr;
392  sequence_view_ = std::array<std::string_view, 4>();
393  return;
394  }
395 
396  // Get the next record. Also give a more user friendly error if this does not work.
397  try {
398  sequence_view_ = input_stream_->get_line_views<4>();
399  } catch( std::exception const& ex ) {
400  throw std::runtime_error(
401  "Cannot stream through fastq " + input_stream_->source_name() +
402  " with fast string view parser, either because the file is corrupt, "
403  "or has lines that are too long. Error: " + ex.what()
404  );
405  }
406 
407  // Parse label 1
408  if( sequence_view_[0].size() < 1 || sequence_view_[0][0] != '@' ) {
409  throw std::runtime_error(
410  "Malformed fastq " + input_stream_->source_name() + ": Expecting '@' at "
411  "beginning of label near line " + std::to_string( input_stream_->line() ) +
412  ". Note that we here can only process fastq with single lines for the " +
413  "sequence and quality data."
414  );
415  }
416  sequence_view_[0].remove_prefix( 1 );
417 
418  // Parse label 2
419  if( sequence_view_[2].size() < 1 || sequence_view_[2][0] != '+' ) {
420  throw std::runtime_error(
421  "Malformed fastq " + input_stream_->source_name() + ": Expecting '+' at "
422  "beginning of label near line " + std::to_string( input_stream_->line() ) +
423  ". Note that we here can only process fastq with single lines for the " +
424  "sequence and quality data."
425  );
426  }
427  sequence_view_[2].remove_prefix( 1 );
428 
429  // Basic check of sequence and quality length.
430  if( sequence_view_[1].empty() ) {
431  throw std::runtime_error(
432  "Malformed fastq " + input_stream_->source_name() + ": Expecting a " +
433  "sequence sites line after the first label line near line "
434  + std::to_string( input_stream_->line() ) +
435  ". Note that we here can only process fastq with single lines for the " +
436  "sequence and quality data."
437  );
438  }
439  if( sequence_view_[1].size() != sequence_view_[3].size() ) {
440  throw std::runtime_error(
441  "Malformed fastq " + input_stream_->source_name() + ": Expecting the " +
442  "quality scores to be of the same length as the sequence near line " +
443  std::to_string( input_stream_->line() ) +
444  ". Note that we here can only process fastq with single lines for the " +
445  "sequence and quality data."
446  );
447  }
448  }
449 
450  // -------------------------------------------------------------------------
451  // Data Members
452  // -------------------------------------------------------------------------
453 
454  private:
455 
456  // Parent. If null, this indicates the end of the input and that we are done iterating.
457  FastxInputViewStream const* parent_ = nullptr;
458 
459  // Data stream to read from.
460  std::shared_ptr<utils::InputStream> input_stream_;
461 
462  // fasta = true, fastq = false
463  bool format_is_fasta_;
464 
465  // The sequence that we parse the input into and expose to the user.
466  std::array<std::string_view, 4> sequence_view_;
467  };
468 
469  // ======================================================================================
470  // Main Class
471  // ======================================================================================
472 
473  // -------------------------------------------------------------------------
474  // Constructors and Rule of Five
475  // -------------------------------------------------------------------------
476 
480  FastxInputViewStream()
481  : input_source_( nullptr )
482  {}
483 
487  explicit FastxInputViewStream(
488  std::shared_ptr<utils::BaseInputSource> source
489  )
490  : input_source_( source )
491  {}
492 
493  ~FastxInputViewStream() = default;
494 
495  FastxInputViewStream( self_type const& ) = default;
496  FastxInputViewStream( self_type&& ) = default;
497 
498  self_type& operator= ( self_type const& ) = default;
499  self_type& operator= ( self_type&& ) = default;
500 
501  // -------------------------------------------------------------------------
502  // Iteration
503  // -------------------------------------------------------------------------
504 
505  Iterator begin() const
506  {
507  return Iterator( this );
508  }
509 
510  Iterator end() const
511  {
512  return Iterator();
513  }
514 
515  // -------------------------------------------------------------------------
516  // Settings
517  // -------------------------------------------------------------------------
518 
519  std::shared_ptr<utils::BaseInputSource> input_source() const
520  {
521  return input_source_;
522  }
523 
524  // -------------------------------------------------------------------------
525  // Data Members
526  // -------------------------------------------------------------------------
527 
528 private:
529 
530  std::shared_ptr<utils::BaseInputSource> input_source_;
531 };
532 
533 } // namespace sequence
534 } // namespace genesis
535 
536 #endif // ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
537 #endif // include guard
genesis::sequence::operator!=
bool operator!=(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:78
std.hpp
Provides some valuable additions to STD.
input_source.hpp
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
input_stream.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
char.hpp
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::sequence::operator==
bool operator==(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:72
sequence.hpp