A toolkit for working with phylogenetic data.
v0.24.0
fasta_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
42 
43 #include <cassert>
44 #include <cctype>
45 #include <fstream>
46 #include <sstream>
47 #include <stdexcept>
48 
49 namespace genesis {
50 namespace sequence {
51 
52 // =================================================================================================
53 // Constructor and Rule of Five
54 // =================================================================================================
55 
57 {
58  lookup_.set_all( true );
59 }
60 
61 // =================================================================================================
62 // Reading
63 // =================================================================================================
64 
65 SequenceSet FastaReader::read( std::shared_ptr< utils::BaseInputSource > source ) const
66 {
67  SequenceSet result;
68  utils::InputStream is( source );
69  parse_document( is, result );
70  return result;
71 }
72 
74  std::shared_ptr< utils::BaseInputSource > source,
75  SequenceSet& sequence_set
76 ) const {
77  utils::InputStream is( source );
78  parse_document( is, sequence_set );
79 }
80 
81 // =================================================================================================
82 // Parsing
83 // =================================================================================================
84 
86  utils::InputStream& input_stream,
87  SequenceSet& sequence_set
88 ) const {
89  Sequence seq;
90 
91  if( parsing_method_ == ParsingMethod::kDefault ) {
92  while( parse_sequence( input_stream, seq ) ) {
93  sequence_set.add( seq );
94  }
95 
96  } else if( parsing_method_ == ParsingMethod::kPedantic ) {
97  while( parse_sequence_pedantic( input_stream, seq ) ) {
98  sequence_set.add( seq );
99  }
100 
101  } else {
102  // There are no other methods currently implemented.
103  assert( false );
104  }
105 }
106 
108  utils::InputStream& input_stream,
109  Sequence& sequence
110 ) const {
111  // Init. Call clear in order to avoid not setting properties that might be added to
112  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
113  // should not change when setting the strings to empty strings.
114  auto& it = input_stream;
115  sequence.clear();
116 
117  // Check for data.
118  if( !it ) {
119  return false;
120  }
121 
122  // ---------------------------------------------
123  // Label
124  // ---------------------------------------------
125 
126  // Scope to ensure that the label line is only used
127  // while we actually are in that line.
128  {
129 
130  // Check beginning of sequence.
131  if( !it || *it != '>' ) {
132  throw std::runtime_error(
133  "Malformed Fasta " + it.source_name()
134  + ": Expecting '>' at beginning of sequence at line " + std::to_string( it.line() ) + "."
135  );
136  }
137  assert( it && *it == '>' );
138  ++it;
139 
140  // Parse label.
141  std::string const label = utils::read_while( it, isprint );
142  if( label == "" ) {
143  throw std::runtime_error(
144  "Malformed Fasta " + it.source_name()
145  + ": Expecting label after '>' in sequence at line " + std::to_string( it.line() ) + "."
146  );
147  }
148  if( guess_abundances_ ) {
149  auto const la = guess_sequence_abundance( label );
150  sequence.label( la.first );
151  sequence.abundance( la.second );
152  } else {
153  sequence.label( label );
154  }
155 
156  // Check for unexpected end of file.
157  if( !it || *it != '\n' ) {
158  throw std::runtime_error(
159  "Malformed Fasta " + it.source_name()
160  + ": Unexpected characters at the end of the label line in sequence at line "
161  + std::to_string( it.line() ) + "."
162  );
163  }
164  assert( it && *it == '\n' );
165  ++it;
166 
167  } // End of line scope. We are done with the label line.
168 
169  // ---------------------------------------------
170  // Sites
171  // ---------------------------------------------
172 
173  // Skip comments.
174  while( it && *it == ';' ) {
175  utils::skip_until( it, '\n' );
176  assert( it && *it == '\n' );
177  ++it;
178  }
179 
180  // Check for unexpected end of file.
181  if( !it ) {
182  throw std::runtime_error(
183  "Malformed Fasta " + it.source_name()
184  + ": Expecting a sequence after the label line in sequence at line "
185  + std::to_string( it.line() - 1 ) + "."
186  );
187  }
188  assert( it );
189 
190  // Reserve some tmp memory. We will later copy the content, so that superfluous capacity
191  // is stripped.
192  // We could do a sites.reserve( ... ) here, but this yields only minor speedups.
193  std::string sites;
194  // sites.reserve( n );
195 
196  // Parse sequence. At every beginning of the loop, we are at a line start.
197  while( it && *it != '>' ) {
198  assert( it.column() == 1 );
199  it.get_line( sites );
200  }
201  assert( !it || *it == '>' );
202 
203  if( sites.length() == 0 ) {
204  throw std::runtime_error(
205  "Malformed Fasta " + it.source_name() + ": Empty sequence at line "
206  + std::to_string( it.line() - 1 ) + "."
207  );
208  }
209 
210  if( site_casing_ == SiteCasing::kToUpper ) {
211  sequence.sites() = utils::to_upper_ascii( sites );
212  } else if( site_casing_ == SiteCasing::kToLower ) {
213  sequence.sites() = utils::to_lower_ascii( sites );
214  } else {
215  // We could do a move here instead, but this way, we save some memory, which might be
216  // more reasonable for big sequences files than the small gain in speed.
217  // sequence.sites() = std::move(sites);
218  sequence.sites() = sites;
219  }
220 
221  if( use_validation_ ) {
222  for( auto const& c : sequence.sites() ) {
223  if( !lookup_[c] ) {
224  throw std::runtime_error(
225  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
226  + utils::char_to_hex( c )
227  + " in the sequence at/above line " + std::to_string( it.line() - 1 ) + "."
228  );
229  }
230  }
231  }
232 
233  return true;
234 }
235 
237  utils::InputStream& input_stream,
238  Sequence& sequence
239 ) const {
240  // Init. Call clear in order to avoid not setting properties that might be added to
241  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
242  // should not change when setting the strings to empty strings.
243  auto& it = input_stream;
244  sequence.clear();
245 
246  // Check for data.
247  if( !it ) {
248  return false;
249  }
250 
251  // Check beginning of sequence.
252  if( it.current() != '>' ) {
253  throw std::runtime_error(
254  "Malformed Fasta " + it.source_name()
255  + ": Expecting '>' at beginning of sequence at " + it.at() + "."
256  );
257  }
258  assert( it && *it == '>' );
259  ++it;
260 
261  // Parse label.
262  std::string label = utils::read_while( it, isprint );
263  if( label == "" ) {
264  throw std::runtime_error(
265  "Malformed Fasta " + it.source_name()
266  + ": Expecting label after '>' at " + it.at() + "."
267  );
268  }
269  if( guess_abundances_ ) {
270  auto const la = guess_sequence_abundance( label );
271  sequence.label( la.first );
272  sequence.abundance( la.second );
273  } else {
274  sequence.label( label );
275  }
276 
277  // Check for unexpected end of stream.
278  if( !it || ( *it != '\n' )) {
279  throw std::runtime_error(
280  "Malformed Fasta " + it.source_name()
281  + ": Expecting a sequence after the label line at " + it.at() + "."
282  );
283  }
284  assert( it && (*it == '\n' ));
285 
286  // Check for unexpected end of file.
287  if( !it || *it != '\n' ) {
288  throw std::runtime_error(
289  "Malformed Fasta " + it.source_name()
290  + ": Expecting a sequence after the label line at " + it.at() + "."
291  );
292  }
293  assert( it && *it == '\n' );
294 
295  // Skip comments.
296  while( it && *it == ';' ) {
297  utils::skip_while( it, isprint );
298  }
299 
300  // Check for unexpected end of file.
301  if( !it || *it != '\n' ) {
302  throw std::runtime_error(
303  "Malformed Fasta " + it.source_name()
304  + ": Expecting a sequence after the label line at " + it.at() + "."
305  );
306  }
307  assert( it && *it == '\n' );
308  ++it;
309 
310  // Parse sequence. At every beginning of the outer loop, we are at a line start.
311  std::string sites;
312  while( it && *it != '>' ) {
313  assert( it.column() == 1 );
314 
315  size_t count = 0;
316  while( it && *it != '\n' ) {
317 
318  // Weird C relicts need weird conversions...
319  // See https://en.cppreference.com/w/cpp/string/byte/tolower
320  char c = *it;
321  if( site_casing_ == SiteCasing::kToUpper ) {
322  c = static_cast<char>( std::toupper( static_cast<unsigned char>( c )));
323  } else if( site_casing_ == SiteCasing::kToLower ) {
324  c = static_cast<char>( std::tolower( static_cast<unsigned char>( c )));
325  }
326  if( use_validation_ && ! lookup_[c] ) {
327  throw std::runtime_error(
328  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
329  + utils::char_to_hex( c ) + " in sequence at " + it.at() + "."
330  );
331  }
332 
333  sites += c;
334  ++it;
335  ++count;
336  }
337 
338  if( count == 0 ) {
339  throw std::runtime_error(
340  "Malformed Fasta " + it.source_name()
341  + ": Empty sequence line at " + it.at() + "."
342  );
343  }
344 
345  if( !it ) {
346  throw std::runtime_error(
347  "Malformed Fasta " + it.source_name()
348  + ": Sequence line does not end with '\\n' at " + it.at() + "."
349  );
350  }
351  assert( it && *it == '\n' );
352  ++it;
353  }
354  assert( !it || *it == '>' );
355 
356  if( sites.length() == 0 ) {
357  throw std::runtime_error(
358  "Malformed Fasta " + it.source_name()
359  + ": Empty sequence at " + it.at() + "."
360  );
361  }
362 
363  // Copy the sequence. We do not use move here, as we can save some memory this way.
364  sequence.sites() = sites;
365 
366  return true;
367 }
368 
369 // =================================================================================================
370 // Properties
371 // =================================================================================================
372 
374 {
375  parsing_method_ = value;
376  return *this;
377 }
378 
380 {
381  return parsing_method_;
382 }
383 
385 {
386  site_casing_ = value;
387  return *this;
388 }
389 
391 {
392  return site_casing_;
393 }
394 
396 {
397  guess_abundances_ = value;
398  return *this;
399 }
400 
402 {
403  return guess_abundances_;
404 }
405 
406 FastaReader& FastaReader::valid_chars( std::string const& chars )
407 {
408  if( chars.size() == 0 ) {
409  lookup_.set_all( true );
410  use_validation_ = false;
411  } else {
412  lookup_.set_all( false );
413  lookup_.set_selection( chars, true );
414  use_validation_ = true;
415  }
416 
417  return *this;
418 }
419 
420 std::string FastaReader::valid_chars() const
421 {
422  // We need to check the valid chars lookup here, because we don't want to return a string
423  // of _all_ chars.
424  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
425  return "";
426  } else {
427  return lookup_.get_chars_equal_to( true );
428  }
429 }
430 
432 {
433  return lookup_;
434 }
435 
436 } // namespace sequence
437 } // namespace genesis
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:153
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
void set_all(T value)
Set the lookup status for all chars at once.
Read Fasta sequence data.
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value...
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:67
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
Definition: string.cpp:484
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Provides some valuable additions to STD.
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
Definition: scanner.hpp:216
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Provides some commonly used string utility functions.
bool guess_abundances() const
Return whether the label is used to guess/extracat Sequence abundances.
Provides functions for accessing the file system.
Store a set of Sequences.
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using it&#39;s label.
Definition: labels.cpp:72
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fasta format and return them as a SequenceSet.
ParsingMethod parsing_method() const
Return the currently set parsing method.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
FastaReader()
Create a default FastaReader. Per default, chars are turned upper case, but not validated.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one...
Definition: scanner.hpp:184
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
Definition: string.cpp:460
Stream interface for reading data from an InputSource, that keeps track of line and column counters...