A library for working with phylogenetic and population genetic data.
v0.32.0
fasta_reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_FORMATS_FASTA_READER_H_
2 #define GENESIS_SEQUENCE_FORMATS_FASTA_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
40 
41 #include <iosfwd>
42 #include <memory>
43 #include <string>
44 
45 namespace genesis {
46 
47 // =================================================================================================
48 // Forward Declarations
49 // =================================================================================================
50 
51 namespace utils {
52  class InputStream;
53 }
54 
55 // =================================================================================================
56 // Fasta Reader
57 // =================================================================================================
58 
59 namespace sequence {
60 
92 {
93 public:
94 
95  // ---------------------------------------------------------------------
96  // Typedefs and Enums
97  // ---------------------------------------------------------------------
98 
/**
 * @brief Enumeration of the available methods for parsing Fasta sequences.
 *
 * The private parse_document_() helper of this class dispatches on this value:
 * kDefault uses parse_sequence(), kPedantic uses parse_sequence_pedantic().
 */
102  enum class ParsingMethod
103  {
116  kDefault, ///< Fast method, used by default.
117 
132  kPedantic ///< Pedantic method.
133  };
134 
/**
 * @brief Enumeration of casing methods to apply to each site of a Sequence.
 */
138  enum class SiteCasing
139  {
143  kUnchanged, ///< Do not change the case of the sites.
144 
148  kToUpper, ///< Make all sites upper case.
149 
153  kToLower ///< Make all sites lower case.
154  };
155 
156  // ---------------------------------------------------------------------
157  // Constructor and Rule of Five
158  // ---------------------------------------------------------------------
159 
/**
 * @brief Create a default FastaReader.
 *
 * The data members of this class carry in-class initializers (see the Members section):
 * ParsingMethod::kDefault, SiteCasing::kUnchanged, no abundance guessing, no validation.
 * NOTE(review): the constructor body is defined outside of this header, so whether it
 * adjusts any of these defaults cannot be seen from here - confirm in the implementation.
 */
165  FastaReader();
166  ~FastaReader() = default;
167 
// Copy and move construction use the compiler-generated member-wise versions.
168  FastaReader( FastaReader const& ) = default;
169  FastaReader( FastaReader&& ) = default;
170 
// Copy and move assignment use the compiler-generated member-wise versions.
171  FastaReader& operator= ( FastaReader const& ) = default;
172  FastaReader& operator= ( FastaReader&& ) = default;
173 
174  // ---------------------------------------------------------------------
175  // Reading
176  // ---------------------------------------------------------------------
177 
/**
 * @brief Read all Sequences from an input source in Fasta format and return them
 * as a SequenceSet.
 *
 * @param source Shared pointer to the input source to read from.
 * @return The SequenceSet holding the Sequences that were read.
 */
185  SequenceSet read( std::shared_ptr< utils::BaseInputSource > source ) const;
186 
/**
 * @brief Overload of read() that receives the target SequenceSet by reference
 * instead of returning a new one.
 *
 * NOTE(review): presumably the Sequences that are read get added to @p sequence_set;
 * whether previously stored content is kept is decided in the implementation file,
 * which is not visible here - confirm there.
 *
 * @param source       Shared pointer to the input source to read from.
 * @param sequence_set Target SequenceSet.
 */
197  void read( std::shared_ptr< utils::BaseInputSource > source, SequenceSet& sequence_set ) const;
198 
/**
 * @brief Read all Sequences from an input source in Fasta format, but only return
 * their names and lengths, as a SequenceDict.
 *
 * @param source Shared pointer to the input source to read from.
 * @return The SequenceDict built from the input.
 */
203  SequenceDict read_dict( std::shared_ptr<utils::BaseInputSource> source ) const;
204 
212  std::shared_ptr<utils::BaseInputSource> source,
213  bool also_look_up_first_word = true
214  ) const;
215 
216  // ---------------------------------------------------------------------
217  // Parsing
218  // ---------------------------------------------------------------------
219 
/**
 * @brief Parse a whole Fasta document into a SequenceSet.
 *
 * @param input_stream Stream to parse from.
 * @param sequence_set Target SequenceSet.
 */
226  void parse_document(
227  utils::InputStream& input_stream,
228  SequenceSet& sequence_set
229  ) const;
230 
/**
 * @brief Parse a Sequence in Fasta format.
 *
 * This is the function used when the parsing method is ParsingMethod::kDefault.
 *
 * @param input_stream Stream to parse from.
 * @param sequence     Sequence object that receives the parsed data.
 * @return The private parse_document_() helper keeps calling this function for as long
 *         as it returns true. NOTE(review): so true presumably means that a Sequence was
 *         read, and false that the input is exhausted - confirm in the implementation.
 */
242  bool parse_sequence(
243  utils::InputStream& input_stream,
244  Sequence& sequence
245  ) const;
246 
263  utils::InputStream& input_stream,
264  Sequence& sequence
265  ) const;
266 
267  // ---------------------------------------------------------------------
268  // Properties
269  // ---------------------------------------------------------------------
270 
278 
285 
294 
/**
 * @brief Return whether Sequence sites are automatically turned into upper or lower case.
 */
298  SiteCasing site_casing() const;
299 
/**
 * @brief Set whether the label is used to guess/extract Sequence abundances.
 *
 * NOTE(review): the FastaReader& return type suggests that the object itself is
 * returned to allow chaining of setters - confirm in the implementation.
 */
307  FastaReader& guess_abundances( bool value );
308 
/**
 * @brief Return whether the label is used to guess/extract Sequence abundances.
 */
312  bool guess_abundances() const;
313 
/**
 * @brief Set the chars used for validating Sequence sites.
 *
 * NOTE(review): how an empty @p chars string is treated, and whether this toggles
 * validation on or off, is decided in the implementation file - confirm there.
 */
331  FastaReader& valid_chars( std::string const& chars );
332 
/**
 * @brief Return the currently set chars used for validating Sequence sites.
 */
338  std::string valid_chars() const;
339 
347 
348  // ---------------------------------------------------------------------
349  // Helper Functions
350  // ---------------------------------------------------------------------
351 
352 private:
353 
361  template<class R, typename... A>
362  void parse_document_( utils::InputStream& input_stream, R& result, A... args ) const
363  {
364  Sequence seq;
365  if( parsing_method_ == ParsingMethod::kDefault ) {
366  while( parse_sequence( input_stream, seq ) ) {
367  result.add( std::move(seq), args... );
368  }
369  } else if( parsing_method_ == ParsingMethod::kPedantic ) {
370  while( parse_sequence_pedantic( input_stream, seq ) ) {
371  result.add( std::move(seq), args... );
372  }
373  } else {
374  // There are no other methods currently implemented.
375  assert( false );
376  }
377  }
378 
379  // ---------------------------------------------------------------------
380  // Members
381  // ---------------------------------------------------------------------
382 
383 private:
384 
// Parsing method; decides which parse function parse_document_() calls.
385  ParsingMethod parsing_method_ = ParsingMethod::kDefault;
386 
// Settings for how the sites and labels are processed. All of them are off/unchanged
// via these in-class initializers.
387  SiteCasing site_casing_ = SiteCasing::kUnchanged;
388  bool guess_abundances_ = false;
389  bool use_validation_ = false;
// NOTE(review): presumably the lookup of valid chars used for validating sites - confirm.
390  utils::CharLookup<bool> lookup_;
391 
392  // Internal reading buffer
// Declared mutable so that the const member functions of the reader can write to it.
393  mutable std::string buffer_;
394 };
395 
396 } // namespace sequence
397 } // namespace genesis
398 
399 #endif // include guard
genesis::sequence::FastaReader::SiteCasing::kToLower
@ kToLower
Make all sites lower case.
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::sequence::FastaReader::ParsingMethod::kPedantic
@ kPedantic
Pedantic method.
genesis::sequence::FastaReader::SiteCasing
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Definition: fasta_reader.hpp:138
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::FastaReader::read_reference_genome
ReferenceGenome read_reference_genome(std::shared_ptr< utils::BaseInputSource > source, bool also_look_up_first_word=true) const
Read all Sequences from an input source in fasta format into a ReferenceGenome.
Definition: fasta_reader.cpp:88
sequence_set.hpp
genesis::sequence::FastaReader
Read Fasta sequence data.
Definition: fasta_reader.hpp:91
genesis::sequence::FastaReader::read_dict
SequenceDict read_dict(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in fasta format, but only return their names and lengths as a SequenceDict.
Definition: fasta_reader.cpp:80
genesis::sequence::FastaReader::parse_sequence_pedantic
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Definition: fasta_reader.cpp:229
input_source.hpp
genesis::sequence::FastaReader::~FastaReader
~FastaReader()=default
char_lookup.hpp
sequence_dict.hpp
genesis::sequence::FastaReader::guess_abundances
bool guess_abundances() const
Return whether the label is used to guess/extract Sequence abundances.
Definition: fasta_reader.cpp:394
genesis::sequence::FastaReader::ParsingMethod::kDefault
@ kDefault
Fast method, used by default.
genesis::sequence::FastaReader::FastaReader
FastaReader()
Create a default FastaReader. Per default, the case of the sites is left unchanged (SiteCasing::kUnchanged), and the sites are not validated.
Definition: fasta_reader.cpp:55
genesis::utils::CharLookup< bool >
reference_genome.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:53
genesis::sequence::FastaReader::site_casing
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
Definition: fasta_reader.cpp:383
genesis::sequence::FastaReader::valid_chars
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
Definition: fasta_reader.cpp:413
genesis::sequence::FastaReader::operator=
FastaReader & operator=(FastaReader const &)=default
genesis::sequence::FastaReader::valid_char_lookup
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
Definition: fasta_reader.cpp:424
genesis::sequence::FastaReader::read
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fasta format and return them as a SequenceSet.
Definition: fasta_reader.cpp:64
genesis::sequence::FastaReader::parsing_method
ParsingMethod parsing_method() const
Return the currently set parsing method.
Definition: fasta_reader.cpp:372
genesis::sequence::FastaReader::parse_sequence
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Definition: fasta_reader.cpp:109
genesis::sequence::ReferenceGenome
Lookup of Sequences of a reference genome.
Definition: reference_genome.hpp:65
genesis::sequence::FastaReader::SiteCasing::kToUpper
@ kToUpper
Make all sites upper case.
genesis::sequence::FastaReader::SiteCasing::kUnchanged
@ kUnchanged
Do not change the case of the sites.
genesis::sequence::FastaReader::ParsingMethod
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
Definition: fasta_reader.hpp:102
sequence.hpp
genesis::sequence::FastaReader::parse_document
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.
Definition: fasta_reader.cpp:102