A library for working with phylogenetic and population genetic data.
v0.27.0
phylip_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cctype>
45 #include <fstream>
46 #include <sstream>
47 #include <stdexcept>
48 
49 namespace genesis {
50 namespace sequence {
51 
52 // =================================================================================================
53 // Constructor and Rule of Five
54 // =================================================================================================
55 
57 {
58  lookup_.set_all( true );
59 }
60 
61 // =================================================================================================
62 // Reading
63 // =================================================================================================
64 
65 SequenceSet PhylipReader::read( std::shared_ptr<utils::BaseInputSource> source ) const
66 {
67  // Create a new set and fill it.
68  SequenceSet result;
69  read( source, result );
70  return result;
71 }
72 
73 void PhylipReader::read( std::shared_ptr<utils::BaseInputSource> source, SequenceSet& target ) const
74 {
75  utils::InputStream it( source );
76  switch( mode_ ) {
77  case Mode::kSequential: {
78  parse_phylip_sequential( it, target );
79  break;
80  }
81  case Mode::kInterleaved: {
82  parse_phylip_interleaved( it, target );
83  break;
84  }
85  default: {
86  // We already covered all cases above. This cannot happen.
87  assert( false );
88  }
89  }
90 }
91 
92 // =================================================================================================
93 // Parsing
94 // =================================================================================================
95 
97 {
98  Header result;
99 
100  // Read number of sequences.
101  utils::skip_while( it, ::isblank );
102  std::string num_seq_str = utils::read_while( it, ::isdigit );
103  if( num_seq_str.length() == 0 ) {
104  throw std::runtime_error(
105  "Malformed Phylip " + it.source_name() + ": Expecting sequence number at "
106  + it.at() + "."
107  );
108  }
109  result.num_sequences = std::stoi( num_seq_str );
110 
111  // Read length of sequences.
112  utils::skip_while( it, ::isblank );
113  std::string len_seq_str = utils::read_while( it, ::isdigit );
114  if( len_seq_str.length() == 0 ) {
115  throw std::runtime_error(
116  "Malformed Phylip " + it.source_name() + ": Expecting sequence length at "
117  + it.at() + "."
118  );
119  }
120  result.len_sequences = std::stoi( len_seq_str );
121 
122  // Sanity check.
123  if( result.num_sequences == 0 || result.len_sequences == 0 ) {
124  throw std::runtime_error(
125  "Malformed Phylip " + it.source_name() + ": Sequences are empty."
126  );
127  }
128 
129  // Process end of header line and proceed to first non-empty line.
130  utils::skip_while( it, ::isblank );
132  if( !it || *it != '\n' ) {
133  throw std::runtime_error(
134  "Malformed Phylip " + it.source_name() + ": Expecting end of line at " + it.at() + "."
135  );
136  }
137  utils::skip_while( it, '\n' );
138 
139  return result;
140 }
141 
143 {
144  std::string label;
145 
146  // Labels need to start with some graphical char.
147  if( !it || ! ::isgraph( *it ) ) {
148  throw std::runtime_error(
149  "Malformed Phylip " + it.source_name() + ": Expecting label at " + it.at() + "."
150  );
151  }
152 
153  // Scan label until first blank/tab.
154  if( label_length_ == 0 ) {
155  label = utils::read_while( it, ::isgraph );
156  if( !it || ! ::isblank( *it ) ) {
157  throw std::runtime_error(
158  "Malformed Phylip " + it.source_name() + ": Expecting delimiting white space at "
159  + it.at() + "."
160  );
161  }
162  utils::skip_while( it, ::isblank );
163 
164  // Scan label for `label_length` many chars.
165  } else {
166  for( size_t i = 0; i < label_length_; ++i ) {
167  if( !it || ! ::isprint( *it ) ) {
168  throw std::runtime_error(
169  "Malformed Phylip " + it.source_name() + ": Invalid label at " + it.at() + "."
170  );
171  }
172 
173  label += *it;
174  ++it;
175  }
176  label = utils::trim( label );
177  }
178 
179  label = utils::trim( label );
180  assert( label.size() > 0 );
181  return label;
182 }
183 
185 {
186  // Read the (rest of) the current line from the input.
187  auto seq = it.get_line();
188 
189  // Clean up as needed. Blanks always, digits only on demand.
190  utils::erase_if( seq, []( char c ){
191  return c == ' ' || c == '\t';
192  });
193  if( remove_digits_ ) {
194  utils::erase_if( seq, []( char c ){
195  return ::isdigit( c );
196  });
197  }
198 
199  // Change case as needed.
200  if( site_casing_ == SiteCasing::kToUpper ) {
201  seq = utils::to_upper_ascii( seq );
202  } else if( site_casing_ == SiteCasing::kToLower ) {
203  seq = utils::to_lower_ascii( seq );
204  }
205 
206  // Validate as needed.
207  if( use_validation_ ) {
208  for( auto const& c : seq ) {
209  if( !lookup_[c] ) {
210  throw std::runtime_error(
211  "Malformed Phylip " + it.source_name() + ": Invalid sequence symbol "
212  + utils::char_to_hex( c )
213  + " in sequence near line " + std::to_string( it.line() - 1 ) + "."
214  );
215  }
216  }
217  }
218 
219  return seq;
220 }
221 
223 {
224  // Parse header line.
225  auto header = parse_phylip_header( it );
226  size_t num_seq = header.num_sequences;
227  size_t len_seq = header.len_sequences;
228 
229  // Process the given number of sequences. If there are not enough, the inner functions will
230  // throw. If there are too many, the check at the end will throw.
231  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
232  assert( it.column() == 1 );
233  Sequence seq;
234 
235  // Parse label.
236  seq.label( parse_phylip_label( it ) );
237 
238  // Parse sequence. As long as we did not read as many sites as the header claimed, we read
239  // more lines from the input stream. If we then read too many chars (checked in the next
240  // step), the file is ill formatted. This is because a sequence always has to end with \n,
241  // and the label of the next sequence always has to start at the beginning of the line.
242  seq.sites().reserve( len_seq );
243  while( seq.sites().length() < len_seq ) {
244  seq.sites() += parse_phylip_sequence_line( it );
245  assert( it.column() == 1 );
246  }
247 
248  // Check sequence length.
249  if( seq.sites().length() > len_seq ) {
250  throw std::runtime_error(
251  "Malformed Phylip " + it.source_name() + ": Sequence with length "
252  + std::to_string( seq.sites().length() ) + " instead of "
253  + std::to_string( len_seq ) + " at " + it.at() + "."
254  );
255  }
256  assert( seq.sites().length() == len_seq );
257 
258  // Add to set.
259  sset.add( seq );
260  }
261 
262  // Final checks.
263  utils::skip_while( it, isspace );
264  if( it ) {
265  throw std::runtime_error(
266  "Malformed Phylip " + it.source_name() + ": Expected end of file at " + it.at() + "."
267  );
268  }
269  assert( sset.size() == num_seq );
270 }
271 
273 {
274  // Parse header line.
275  auto header = parse_phylip_header( it );
276  size_t num_seq = header.num_sequences;
277  size_t len_seq = header.len_sequences;
278 
279  // Helper function that checks the sequence length and throws if it is too long.
280  auto check_seq_len = [ &it, &len_seq ] ( Sequence const& seq ) {
281  if( seq.length() > len_seq ) {
282  throw std::runtime_error(
283  "Malformed Phylip " + it.source_name() + ": Sequence with length "
284  + std::to_string( seq.sites().length() ) + " instead of "
285  + std::to_string( len_seq ) + " at " + it.at() + "."
286  );
287  }
288  };
289 
290  // Process the first block, which contains the labels.
291  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
292  assert( it.column() == 1 );
293  Sequence seq;
294 
295  // Parse label.
296  seq.label( parse_phylip_label( it ) );
297 
298  // Reserve mem and parse first part of sequence.
299  seq.sites().reserve( len_seq );
300  seq.sites() += parse_phylip_sequence_line( it );
301  check_seq_len( seq );
302 
303  // Add to set.
304  sset.add( seq );
305  }
306 
307  // Helper function that checks whether there are still sequences in the set that are not yet
308  // done (i.e., don't have `len_seq` length).
309  auto unfinished_sequences = [ & ] () {
310  for( auto const& seq : sset ) {
311  assert( seq.length() <= len_seq );
312  if( seq.length() < len_seq ) {
313  return true;
314  }
315  }
316  return false;
317  };
318 
319  while( unfinished_sequences() ) {
320  // Each block might start with an empty line. Skip.
321  if( !it ) {
322  throw std::runtime_error(
323  "Malformed Phylip " + it.source_name() + ": Unexpected end of file at "
324  + it.at() + "."
325  );
326  }
327  if( *it == '\n' ) {
328  ++it;
329  }
330 
331  // Parse the next block.
332  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
333  assert( it.column() == 1 );
334  sset[seq_n].sites() += parse_phylip_sequence_line( it );
335  check_seq_len( sset[seq_n] );
336  }
337  }
338 
339  assert( sset.size() == num_seq );
340 }
341 
342 // =================================================================================================
343 // Properties
344 // =================================================================================================
345 
347 {
348  mode_ = value;
349  return *this;
350 }
351 
353 {
354  return mode_;
355 }
356 
358 {
359  label_length_ = value;
360  return *this;
361 }
362 
364 {
365  return label_length_;
366 }
367 
369 {
370  site_casing_ = value;
371  return *this;
372 }
373 
375 {
376  return site_casing_;
377 }
378 
380 {
381  remove_digits_ = value;
382  return *this;
383 }
384 
386 {
387  return remove_digits_;
388 }
389 
390 PhylipReader& PhylipReader::valid_chars( std::string const& chars )
391 {
392  if( chars.size() == 0 ) {
393  lookup_.set_all( true );
394  use_validation_ = false;
395  } else {
396  lookup_.set_all( false );
397  lookup_.set_selection( chars, true );
398  use_validation_ = true;
399  }
400 
401  return *this;
402 }
403 
404 std::string PhylipReader::valid_chars() const
405 {
406  // We need to check the valid chars lookup here, because we don't want to return a string
407  // of _all_ chars.
408  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
409  return "";
410  } else {
411  return lookup_.get_chars_equal_to( true );
412  }
413 }
414 
416 {
417  return lookup_;
418 }
419 
420 } // namespace sequence
421 } // namespace genesis
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:481
genesis::sequence::PhylipReader::SiteCasing::kToLower
@ kToLower
Make all sites lower case.
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::sequence::PhylipReader::SiteCasing::kToUpper
@ kToUpper
Make all sites upper case.
genesis::utils::InputStream::column
size_t column() const
Return the current column of the input stream.
Definition: input_stream.hpp:472
genesis::sequence::PhylipReader::parse_phylip_interleaved
void parse_phylip_interleaved(utils::InputStream &it, SequenceSet &sset) const
Parse a whole Phylip file using the interleaved variant (Mode::kInterleaved).
Definition: phylip_reader.cpp:272
genesis::sequence::PhylipReader::SiteCasing
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Definition: phylip_reader.hpp:143
genesis::utils::trim_right
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with left trimmed white spaces.
Definition: string.cpp:578
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:522
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
fs.hpp
Provides functions for accessing the file system.
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::PhylipReader::valid_char_lookup
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
Definition: phylip_reader.cpp:415
genesis::utils::CharLookup::all_equal_to
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Definition: char_lookup.hpp:242
genesis::utils::trim
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:602
genesis::sequence::PhylipReader::parse_phylip_label
std::string parse_phylip_label(utils::InputStream &it) const
Parse and return a Phylip label.
Definition: phylip_reader.cpp:142
sequence_set.hpp
std.hpp
Provides some valuable additions to STD.
genesis::sequence::PhylipReader::site_casing
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
Definition: phylip_reader.cpp:374
genesis::sequence::SequenceSet::size
size_t size() const
Definition: sequence_set.cpp:52
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genesis::sequence::PhylipReader::Header
Helper that stores the header information of a Phylip file.
Definition: phylip_reader.hpp:97
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::sequence::PhylipReader::read
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Phylip format and return them as a SequenceSet.
Definition: phylip_reader.cpp:65
genesis::sequence::PhylipReader::parse_phylip_header
Header parse_phylip_header(utils::InputStream &it) const
Parse a Phylip header and return the contained sequence count and length.
Definition: phylip_reader.cpp:96
genesis::sequence::PhylipReader
Read Phylip sequence data.
Definition: phylip_reader.hpp:86
genesis::utils::erase_if
void erase_if(Container &c, UnaryPredicate p)
Erases all elements from the container that satisfy a given predicate. An element is erased,...
Definition: algorithm.hpp:107
genesis::sequence::Sequence::label
std::string & label()
Definition: sequence/sequence.hpp:90
genesis::utils::to_upper_ascii
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
Definition: string.cpp:692
genesis::sequence::PhylipReader::Mode
Mode
Enum to distinguish between the different file variants of Phylip. See mode( Mode value ) for more de...
Definition: phylip_reader.hpp:127
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
genesis::sequence::PhylipReader::Header::len_sequences
size_t len_sequences
Length of the sequences in the Phylip file.
Definition: phylip_reader.hpp:107
genesis::utils::CharLookup< bool >
genesis::utils::CharLookup::get_chars_equal_to
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value.
Definition: char_lookup.hpp:228
genesis::sequence::PhylipReader::parse_phylip_sequence_line
std::string parse_phylip_sequence_line(utils::InputStream &it) const
Parse one sequence line.
Definition: phylip_reader.cpp:184
genesis::sequence::PhylipReader::parse_phylip_sequential
void parse_phylip_sequential(utils::InputStream &it, SequenceSet &sset) const
Parse a whole Phylip file using the sequential variant (Mode::kSequential).
Definition: phylip_reader.cpp:222
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::PhylipReader::Mode::kInterleaved
@ kInterleaved
Read the data in Phylip interleaved mode.
genesis::utils::InputStream::get_line
void get_line(std::string &target)
Read the current line, append it to the target, and move to the beginning of the next line.
Definition: input_stream.cpp:88
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:59
genesis::sequence::SequenceSet::add
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
Definition: sequence_set.cpp:86
genesis::sequence::PhylipReader::label_length
size_t label_length() const
Return the currently set label length.
Definition: phylip_reader.cpp:363
genesis::sequence::Sequence::sites
std::string & sites()
Definition: sequence/sequence.hpp:110
genesis::utils::InputStream::line
size_t line() const
Return the current line of the input stream.
Definition: input_stream.hpp:461
genesis::utils::to_lower_ascii
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
Definition: string.cpp:668
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
scanner.hpp
genesis::sequence::PhylipReader::Header::num_sequences
size_t num_sequences
Number of sequences in the Phylip file.
Definition: phylip_reader.hpp:102
genesis::sequence::PhylipReader::Header::options
std::string options
Store the options that might be at the end of the header line.
Definition: phylip_reader.hpp:120
genesis::sequence::PhylipReader::mode
Mode mode() const
Definition: phylip_reader.cpp:352
genesis::utils::CharLookup::set_all
void set_all(T value)
Set the lookup status for all chars at once.
Definition: char_lookup.hpp:192
phylip_reader.hpp
genesis::sequence::PhylipReader::PhylipReader
PhylipReader()
Create a default PhylipReader. Per default, chars are turned upper case, but not validated.
Definition: phylip_reader.cpp:56
genesis::utils::CharLookup::set_selection
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
Definition: char_lookup.hpp:157
sequence.hpp
genesis::utils::read_to_end_of_line
std::string read_to_end_of_line(InputStream &source)
Lexing function that reads until the end of the line (i.e., to the new line char),...
Definition: scanner.hpp:134
genesis::sequence::PhylipReader::remove_digits
bool remove_digits() const
Return whether digits are removed from the Sequence.
Definition: phylip_reader.cpp:385
genesis::sequence::PhylipReader::valid_chars
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
Definition: phylip_reader.cpp:404
genesis::sequence::PhylipReader::Mode::kSequential
@ kSequential
Read the data in Phylip sequential mode.