A library for working with phylogenetic and population genetic data.
v0.32.0
fasta_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
40 
41 #include <algorithm>
42 #include <cassert>
43 #include <cctype>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 
48 namespace genesis {
49 namespace sequence {
50 
51 // =================================================================================================
52 // Constructor and Rule of Five
53 // =================================================================================================
54 
56 {
57  lookup_.set_all( true );
58 }
59 
60 // =================================================================================================
61 // Reading
62 // =================================================================================================
63 
64 SequenceSet FastaReader::read( std::shared_ptr< utils::BaseInputSource > source ) const
65 {
66  SequenceSet result;
67  utils::InputStream input_stream( source );
68  parse_document_( input_stream, result );
69  return result;
70 }
71 
73  std::shared_ptr< utils::BaseInputSource > source,
74  SequenceSet& sequence_set
75 ) const {
76  utils::InputStream input_stream( source );
77  parse_document_( input_stream, sequence_set );
78 }
79 
80 SequenceDict FastaReader::read_dict( std::shared_ptr<utils::BaseInputSource> source ) const
81 {
82  SequenceDict result;
83  utils::InputStream input_stream( source );
84  parse_document_( input_stream, result );
85  return result;
86 }
87 
89  std::shared_ptr<utils::BaseInputSource> source,
90  bool also_look_up_first_word
91 ) const {
92  ReferenceGenome result;
93  utils::InputStream input_stream( source );
94  parse_document_( input_stream, result, also_look_up_first_word );
95  return result;
96 }
97 
98 // =================================================================================================
99 // Parsing
100 // =================================================================================================
101 
103  utils::InputStream& input_stream,
104  SequenceSet& sequence_set
105 ) const {
106  parse_document_( input_stream, sequence_set );
107 }
108 
110  utils::InputStream& input_stream,
111  Sequence& sequence
112 ) const {
113  // Init. Call clear on the sequence in order to avoid not setting properties that might be added
114  // to Sequence in the future. Should not noticeable affect speed, as the sequence string
115  // capacities should not change when setting the strings to empty strings.
116  auto& it = input_stream;
117  sequence.clear();
118 
119  // Check for data.
120  if( !it ) {
121  return false;
122  }
123 
124  // ---------------------------------------------
125  // Label
126  // ---------------------------------------------
127 
128  // Check beginning of sequence.
129  if( !it || *it != '>' ) {
130  throw std::runtime_error(
131  "Malformed Fasta " + it.source_name()
132  + ": Expecting '>' at beginning of sequence at line " + std::to_string( it.line() ) + "."
133  );
134  }
135  assert( it && *it == '>' );
136  ++it;
137 
138  // Parse label.
139  buffer_.clear();
140  it.get_line( buffer_ );
141  auto const buffer_is_print = std::all_of(
142  buffer_.cbegin(),
143  buffer_.cend(),
144  []( char c ){
145  return utils::is_print( c );
146  }
147  );
148  if( buffer_.empty() || !buffer_is_print ) {
149  throw std::runtime_error(
150  "Malformed Fasta " + it.source_name() + ": Expecting valid label after '>' "
151  "in sequence at line " + std::to_string( it.line() ) + ", but instead the label "
152  "is empty or contains non-printable characters."
153  );
154  }
155  if( guess_abundances_ ) {
156  auto const la = guess_sequence_abundance( buffer_ );
157  sequence.label( la.first );
158  sequence.abundance( la.second );
159  } else {
160  sequence.label( buffer_ );
161  }
162 
163  // ---------------------------------------------
164  // Sites
165  // ---------------------------------------------
166 
167  // Skip comments.
168  while( it && *it == ';' ) {
169  utils::skip_until( it, '\n' );
170  assert( it && *it == '\n' );
171  ++it;
172  }
173 
174  // Check for unexpected end of file.
175  if( !it ) {
176  throw std::runtime_error(
177  "Malformed Fasta " + it.source_name()
178  + ": Expecting a sequence after the label line in sequence at line "
179  + std::to_string( it.line() - 1 ) + "."
180  );
181  }
182  assert( it );
183 
184  // Use some tmp memory. We will later copy the content, so that superfluous capacity
185  // is stripped. We could do a sites.reserve( ... ) here, but this yields only minor speedups.
186  buffer_.clear();
187  // buffer_.reserve( n );
188 
189  // Parse sequence. At every beginning of the loop, we are at a line start.
190  while( it && *it != '>' ) {
191  assert( it.column() == 1 );
192  it.get_line( buffer_ );
193  }
194  assert( !it || *it == '>' );
195 
196  if( buffer_.length() == 0 ) {
197  throw std::runtime_error(
198  "Malformed Fasta " + it.source_name() + ": Empty sequence at line "
199  + std::to_string( it.line() - 1 ) + "."
200  );
201  }
202 
203  if( site_casing_ == SiteCasing::kToUpper ) {
204  sequence.sites() = utils::to_upper_ascii( buffer_ );
205  } else if( site_casing_ == SiteCasing::kToLower ) {
206  sequence.sites() = utils::to_lower_ascii( buffer_ );
207  } else {
208  // We could do a move here instead, but this way, we save some memory, which might be
209  // more reasonable for big sequences files than the small gain in speed.
210  // sequence.sites() = std::move(sites);
211  sequence.sites() = buffer_;
212  }
213 
214  if( use_validation_ ) {
215  for( auto const& c : sequence.sites() ) {
216  if( !lookup_[c] ) {
217  throw std::runtime_error(
218  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
219  + utils::char_to_hex( c )
220  + " in the sequence at/above line " + std::to_string( it.line() - 1 ) + "."
221  );
222  }
223  }
224  }
225 
226  return true;
227 }
228 
230  utils::InputStream& input_stream,
231  Sequence& sequence
232 ) const {
233  // Init. Call clear in order to avoid not setting properties that might be added to
234  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
235  // should not change when setting the strings to empty strings.
236  auto& it = input_stream;
237  sequence.clear();
238 
239  // Check for data.
240  if( !it ) {
241  return false;
242  }
243 
244  // Check beginning of sequence.
245  if( it.current() != '>' ) {
246  throw std::runtime_error(
247  "Malformed Fasta " + it.source_name()
248  + ": Expecting '>' at beginning of sequence at " + it.at() + "."
249  );
250  }
251  assert( it && *it == '>' );
252  ++it;
253 
254  // Parse label.
255  std::string label = utils::read_while( it, isprint );
256  if( label == "" ) {
257  throw std::runtime_error(
258  "Malformed Fasta " + it.source_name()
259  + ": Expecting label after '>' at " + it.at() + "."
260  );
261  }
262  if( guess_abundances_ ) {
263  auto const la = guess_sequence_abundance( label );
264  sequence.label( la.first );
265  sequence.abundance( la.second );
266  } else {
267  sequence.label( label );
268  }
269 
270  // Check for unexpected end of stream.
271  if( !it || ( *it != '\n' )) {
272  throw std::runtime_error(
273  "Malformed Fasta " + it.source_name()
274  + ": Expecting a sequence after the label line at " + it.at() + "."
275  );
276  }
277  assert( it && (*it == '\n' ));
278 
279  // Check for unexpected end of file.
280  if( !it || *it != '\n' ) {
281  throw std::runtime_error(
282  "Malformed Fasta " + it.source_name()
283  + ": Expecting a sequence after the label line at " + it.at() + "."
284  );
285  }
286  assert( it && *it == '\n' );
287 
288  // Skip comments.
289  while( it && *it == ';' ) {
290  utils::skip_while( it, isprint );
291  }
292 
293  // Check for unexpected end of file.
294  if( !it || *it != '\n' ) {
295  throw std::runtime_error(
296  "Malformed Fasta " + it.source_name()
297  + ": Expecting a sequence after the label line at " + it.at() + "."
298  );
299  }
300  assert( it && *it == '\n' );
301  ++it;
302 
303  // Parse sequence. At every beginning of the outer loop, we are at a line start.
304  std::string sites;
305  while( it && *it != '>' ) {
306  assert( it.column() == 1 );
307 
308  size_t count = 0;
309  while( it && *it != '\n' ) {
310 
311  // Weird C relicts need weird conversions...
312  // See https://en.cppreference.com/w/cpp/string/byte/tolower
313  char c = *it;
314  if( site_casing_ == SiteCasing::kToUpper ) {
315  c = static_cast<char>( std::toupper( static_cast<unsigned char>( c )));
316  } else if( site_casing_ == SiteCasing::kToLower ) {
317  c = static_cast<char>( std::tolower( static_cast<unsigned char>( c )));
318  }
319  if( use_validation_ && ! lookup_[c] ) {
320  throw std::runtime_error(
321  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
322  + utils::char_to_hex( c ) + " in sequence at " + it.at() + "."
323  );
324  }
325 
326  sites += c;
327  ++it;
328  ++count;
329  }
330 
331  if( count == 0 ) {
332  throw std::runtime_error(
333  "Malformed Fasta " + it.source_name()
334  + ": Empty sequence line at " + it.at() + "."
335  );
336  }
337 
338  if( !it ) {
339  throw std::runtime_error(
340  "Malformed Fasta " + it.source_name()
341  + ": Sequence line does not end with '\\n' at " + it.at() + "."
342  );
343  }
344  assert( it && *it == '\n' );
345  ++it;
346  }
347  assert( !it || *it == '>' );
348 
349  if( sites.length() == 0 ) {
350  throw std::runtime_error(
351  "Malformed Fasta " + it.source_name()
352  + ": Empty sequence at " + it.at() + "."
353  );
354  }
355 
356  // Copy the sequence. We do not use move here, as we can save some memory this way.
357  sequence.sites() = sites;
358 
359  return true;
360 }
361 
362 // =================================================================================================
363 // Properties
364 // =================================================================================================
365 
367 {
368  parsing_method_ = value;
369  return *this;
370 }
371 
373 {
374  return parsing_method_;
375 }
376 
378 {
379  site_casing_ = value;
380  return *this;
381 }
382 
384 {
385  return site_casing_;
386 }
387 
389 {
390  guess_abundances_ = value;
391  return *this;
392 }
393 
395 {
396  return guess_abundances_;
397 }
398 
399 FastaReader& FastaReader::valid_chars( std::string const& chars )
400 {
401  if( chars.size() == 0 ) {
402  lookup_.set_all( true );
403  use_validation_ = false;
404  } else {
405  lookup_.set_all( false );
406  lookup_.set_selection( chars, true );
407  use_validation_ = true;
408  }
409 
410  return *this;
411 }
412 
413 std::string FastaReader::valid_chars() const
414 {
415  // We need to check the valid chars lookup here, because we don't want to return a string
416  // of _all_ chars.
417  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
418  return "";
419  } else {
420  return lookup_.get_chars_equal_to( true );
421  }
422 }
423 
425 {
426  return lookup_;
427 }
428 
429 } // namespace sequence
430 } // namespace genesis
genesis::sequence::FastaReader::SiteCasing::kToLower
@ kToLower
Make all sites lower case.
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::sequence::Sequence::clear
void clear()
Definition: sequence/sequence.hpp:78
fasta_reader.hpp
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::sequence::FastaReader::SiteCasing
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Definition: fasta_reader.hpp:138
fs.hpp
Provides functions for accessing the file system.
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::utils::CharLookup::all_equal_to
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Definition: char_lookup.hpp:242
labels.hpp
genesis::sequence::FastaReader::read_reference_genome
ReferenceGenome read_reference_genome(std::shared_ptr< utils::BaseInputSource > source, bool also_look_up_first_word=true) const
Read all Sequences from an input source in fasta format into a ReferenceGenome.
Definition: fasta_reader.cpp:88
genesis::sequence::FastaReader
Read Fasta sequence data.
Definition: fasta_reader.hpp:91
genesis::sequence::FastaReader::read_dict
SequenceDict read_dict(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in fasta format, but only return their names and lengths as a...
Definition: fasta_reader.cpp:80
genesis::sequence::FastaReader::parse_sequence_pedantic
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Definition: fasta_reader.cpp:229
std.hpp
Provides some valuable additions to STD.
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::sequence::Sequence::label
std::string & label()
Definition: sequence/sequence.hpp:90
genesis::sequence::Sequence::abundance
size_t abundance() const
Definition: sequence/sequence.hpp:150
genesis::utils::to_upper_ascii
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
Definition: string.cpp:919
genesis::sequence::FastaReader::guess_abundances
bool guess_abundances() const
Return whether the label is used to guess/extracat Sequence abundances.
Definition: fasta_reader.cpp:394
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
genesis::sequence::FastaReader::FastaReader
FastaReader()
Create a default FastaReader. Per default, chars are turned upper case, but not validated.
Definition: fasta_reader.cpp:55
genesis::utils::CharLookup< bool >
genesis::utils::CharLookup::get_chars_equal_to
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value.
Definition: char_lookup.hpp:228
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:53
genesis::sequence::FastaReader::site_casing
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
Definition: fasta_reader.cpp:383
genesis::sequence::FastaReader::valid_chars
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
Definition: fasta_reader.cpp:413
char.hpp
genesis::utils::skip_until
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one.
Definition: scanner.hpp:184
genesis::sequence::FastaReader::valid_char_lookup
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
Definition: fasta_reader.cpp:424
genesis::sequence::Sequence::sites
std::string & sites()
Definition: sequence/sequence.hpp:110
genesis::utils::to_lower_ascii
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
Definition: string.cpp:895
genesis::sequence::FastaReader::read
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fasta format and return them as a SequenceSet.
Definition: fasta_reader.cpp:64
genesis::sequence::FastaReader::parsing_method
ParsingMethod parsing_method() const
Return the currently set parsing method.
Definition: fasta_reader.cpp:372
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::sequence::FastaReader::parse_sequence
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Definition: fasta_reader.cpp:109
scanner.hpp
genesis::utils::CharLookup::set_all
void set_all(T value)
Set the lookup status for all chars at once.
Definition: char_lookup.hpp:192
genesis::sequence::guess_sequence_abundance
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using it's label.
Definition: labels.cpp:72
genesis::sequence::ReferenceGenome
Lookup of Sequences of a reference genome.
Definition: reference_genome.hpp:65
genesis::sequence::FastaReader::SiteCasing::kToUpper
@ kToUpper
Make all sites upper case.
genesis::sequence::FastaReader::ParsingMethod
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
Definition: fasta_reader.hpp:102
genesis::utils::CharLookup::set_selection
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
Definition: char_lookup.hpp:157
genesis::sequence::FastaReader::parse_document
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.
Definition: fasta_reader.cpp:102