A library for working with phylogenetic data.
v0.25.0
gff_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <cassert>
38 #include <limits>
39 #include <stdexcept>
40 
41 namespace genesis {
42 namespace population {
43 
44 // =================================================================================================
45 // Reading & Parsing
46 // =================================================================================================
47 
48 std::vector<GffReader::Feature> GffReader::read(
49  std::shared_ptr< utils::BaseInputSource > source
50 ) const {
51  std::vector<GffReader::Feature> result;
52  utils::InputStream it( source );
53 
54  Feature feat;
55  while( parse_line( it, feat ) ) {
56  result.push_back( std::move( feat ));
57  }
58  return result;
59 }
60 
62  utils::InputStream& input_stream,
63  GffReader::Feature& feature
64 ) const {
65  // Shorthand.
66  auto& it = input_stream;
67  if( !it ) {
68  return false;
69  }
70 
71  // In the following, whenever we use one of the utils stream reading functions, those
72  // also check for end of stream and throw whenever their expected format is not found
73  // (e.g., when parsing a number, there has to be a number - empty/end of stream will throw).
74  // So, in all these cases, we do not need additional checks here; we hence only add checks
75  // for end of stream etc when we read the chars ourselves here (e.g., for the strand).
76 
77  // Read seqname, source, and feature.
78  // We use \n as stopping criterion here as well, so that in case of an error,
79  // we at least report the error in the correct line.
80  feature.seqname = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
81  utils::read_char_or_throw( it, '\t' );
82  feature.source = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
83  utils::read_char_or_throw( it, '\t' );
84  feature.feature = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
85  utils::read_char_or_throw( it, '\t' );
86 
87  // Read start and end
88  feature.start = utils::parse_unsigned_integer<size_t>( it );
89  utils::read_char_or_throw( it, '\t' );
90  feature.end = utils::parse_unsigned_integer<size_t>( it );
91  utils::read_char_or_throw( it, '\t' );
92 
93  // Read score, allowing for it to be empty
94  if( it && *it == '.' ) {
95  feature.score = std::numeric_limits<double>::quiet_NaN();
96  ++it;
97  } else {
98  feature.score = utils::parse_float<double>( it );
99  }
100  utils::read_char_or_throw( it, '\t' );
101 
102  // Read strand
103  if( !it ) {
104  throw std::runtime_error(
105  std::string("In ") + it.source_name() + ": Unexpected end of input at " + it.at()
106  );
107  }
108  feature.strand = *it;
109  ++it;
110  utils::read_char_or_throw( it, '\t' );
111 
112  // Read frame, allowing for it to be empty
113  if( it && *it == '.' ) {
114  feature.frame = -1;
115  ++it;
116  } else {
117  feature.frame = utils::parse_unsigned_integer<signed char>( it );
118  }
119 
120  // There might be no attributes, and the line might end early.
121  // Otherwise, there needs to be a tab befor the attributes.
122  if( !it || *it == '\n' ) {
123  ++it;
124  return true;
125  }
126  utils::read_char_or_throw( it, '\t' );
127 
128  // Read attributes. GFF and GTF are slightly different, it seems, one with `=` between
129  // key and value, the other with space, and with a semicolon at the end. We here allow
130  // for all these variants.
131  // GFF: `hid=trf; hstart=1; hend=21`
132  // GTF: `gene_id "ENSG00000223972"; gene_name "DDX11L1";`
133  while( it && *it != '\n' ) {
134  // Read key
135  utils::skip_while( it, ' ' );
136  std::string key = utils::read_while( it, []( char c ){
137  return c != '=' && c != ' ' && c != '\n';
138  });
139  utils::read_char_or_throw( it, []( char c ){ return c == '=' || c == ' '; });
140  if( !it || *it == '\n' ) {
141  throw std::runtime_error(
142  std::string("In ") + it.source_name() +
143  ": Unexpected end of line after attribute key at " + it.at()
144  );
145  }
146 
147  // Read value
148  std::string value;
149  assert( it );
150  if( *it == '"' ) {
151  value = utils::parse_quoted_string( it );
152  } else {
153  value = utils::read_while( it, []( char c ){ return c != ';' && c != '\n'; });
154  }
155 
156  // Store key value pair
157  feature.attributes.emplace_back( std::move( key ), std::move( value ));
158 
159  // The attributes end with a closing `;`, which we just want to skip
160  if( it && *it == ';' ) {
161  ++it;
162  }
163  utils::skip_while( it, ' ' );
164  }
165 
166  assert( !it || *it == '\n' );
167  ++it;
168  return true;
169 }
170 
171 } // namespace population
172 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:80
parser.hpp
genesis::population::GffReader::Feature::feature
std::string feature
Definition: gff_reader.hpp:72
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::population::GffReader::Feature::frame
signed char frame
Definition: gff_reader.hpp:77
genesis::population::GffReader::Feature::source
std::string source
Definition: gff_reader.hpp:71
genesis::population::GffReader::Feature::start
size_t start
Definition: gff_reader.hpp:73
genesis::population::GffReader::parse_line
bool parse_line(utils::InputStream &input_stream, Feature &feature) const
Definition: gff_reader.cpp:61
genesis::population::GffReader::Feature
Definition: gff_reader.hpp:68
genesis::population::GffReader::Feature::seqname
std::string seqname
Definition: gff_reader.hpp:70
genesis::population::GffReader::Feature::attributes
std::vector< Attribute > attributes
Definition: gff_reader.hpp:78
genesis::utils::read_char_or_throw
char read_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that reads a single char from the stream and checks whether it equals the provided on...
Definition: scanner.hpp:299
genesis::population::GffReader::read
std::vector< Feature > read(std::shared_ptr< utils::BaseInputSource > source) const
Definition: gff_reader.cpp:48
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GffReader::Feature::end
size_t end
Definition: gff_reader.hpp:74
char.hpp
scanner.hpp
gff_reader.hpp
genesis::utils::parse_quoted_string
std::string parse_quoted_string(utils::InputStream &source, bool use_escapes, bool use_twin_quotes, bool include_qmarks)
Read a string in quotation marks from a stream and return it.
Definition: parser.cpp:116
genesis::population::GffReader::Feature::strand
char strand
Definition: gff_reader.hpp:76
genesis::population::GffReader::Feature::score
double score
Definition: gff_reader.hpp:75