A library for working with phylogenetic and population genetic data.
v0.27.0
gff_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
38 
39 #include <cassert>
40 #include <cstring>
41 #include <limits>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Reading
49 // =================================================================================================
50 
51 std::vector<GffReader::Feature> GffReader::read(
52  std::shared_ptr< utils::BaseInputSource > source
53 ) const {
54  std::vector<GffReader::Feature> result;
55  utils::InputStream it( source );
56  Feature feat;
57  while( parse_line( it, feat ) ) {
58  result.push_back( std::move( feat ));
59  }
60  return result;
61 }
62 
64  std::shared_ptr< utils::BaseInputSource > source,
65  bool merge
66 ) const {
67  GenomeRegionList result;
68  read_as_genome_region_list( source, result, merge );
69  return result;
70 }
71 
73  std::shared_ptr< utils::BaseInputSource > source,
74  GenomeRegionList& target,
75  bool merge
76 ) const {
77  utils::InputStream it( source );
78  Feature feat;
79  while( parse_line( it, feat ) ) {
80  target.add( feat.seqname, feat.start, feat.end, merge );
81  }
82 }
83 
84 // =================================================================================================
85 // Parsing
86 // =================================================================================================
87 
89  utils::InputStream& input_stream,
90  GffReader::Feature& feature
91 ) const {
92  // Shorthand.
93  auto& it = input_stream;
94  if( !it ) {
95  return false;
96  }
97 
98  // In the following, whenever we use one of the utils stream reading functions, those
99  // also check for end of stream and throw whenever their expected format is not found
100  // (e.g., when parsing a number, there has to be a number - empty/end of stream will throw).
101  // So, in all these cases, we do not need additional checks here; we hence only add checks
102  // for end of stream etc when we read the chars ourselves here (e.g., for the strand).
103 
104  // Track lines are ignored. We look for the word "track " with a space.
105  // If found, we skip the line.
106  while( true ) {
107  auto const buff = it.buffer();
108  if(
109  ( buff.second >= 6 && strncmp( buff.first, "track ", 6 ) == 0 ) ||
110  ( buff.second >= 8 && strncmp( buff.first, "browser ", 8 ) == 0 ) ||
111  ( *it == '#' || *it == '\n' )
112  ) {
113  it.get_line();
114  } else {
115  break;
116  }
117  }
118 
119  // Read seqname, source, and feature.
120  // We use \n as stopping criterion here as well, so that in case of an error,
121  // we at least report the error in the correct line.
122  feature.seqname = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
123  utils::read_char_or_throw( it, '\t' );
124  feature.source = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
125  utils::read_char_or_throw( it, '\t' );
126  feature.feature = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
127  utils::read_char_or_throw( it, '\t' );
128 
129  // Read start and end
130  feature.start = utils::parse_unsigned_integer<size_t>( it );
131  utils::read_char_or_throw( it, '\t' );
132  feature.end = utils::parse_unsigned_integer<size_t>( it );
133  utils::read_char_or_throw( it, '\t' );
134 
135  // Read score, allowing for it to be empty
136  if( it && *it == '.' ) {
137  feature.score = std::numeric_limits<double>::quiet_NaN();
138  ++it;
139  } else {
140  feature.score = utils::parse_float<double>( it );
141  }
142  utils::read_char_or_throw( it, '\t' );
143 
144  // Read strand
145  if( !it ) {
146  throw std::runtime_error(
147  std::string("In ") + it.source_name() + ": Unexpected end of input at " + it.at()
148  );
149  }
150  feature.strand = *it;
151  ++it;
152  utils::read_char_or_throw( it, '\t' );
153 
154  // Read frame, allowing for it to be empty
155  if( it && *it == '.' ) {
156  feature.frame = -1;
157  ++it;
158  } else {
159  feature.frame = utils::parse_unsigned_integer<signed char>( it );
160  }
161 
162  // There might be no attributes, and the line might end early.
163  // Otherwise, there needs to be a tab befor the attributes.
164  if( !it || *it == '\n' ) {
165  ++it;
166  return true;
167  }
168  utils::read_char_or_throw( it, '\t' );
169 
170  // Read attributes. GFF2, GFF3, and GTF are slightly different, it seems... Bioinformatics...
171  // We just read all of them into a single string.
172  feature.attributes_group = it.get_line();
173 
174  // Old code that works for GFF3 and GTF, but not for GFF2. Kept for future refinement.
175  // GFF2 has just a simple string at the end, while GFF3 has `=` between key and value,
176  // and GTF uses space, and with a semicolon at the end. We here allow for all these variants.
177  // GFF: `hid=trf; hstart=1; hend=21`
178  // GTF: `gene_id "ENSG00000223972"; gene_name "DDX11L1";`
179  // while( it && *it != '\n' ) {
180  // // Read key
181  // utils::skip_while( it, ' ' );
182  // std::string key = utils::read_while( it, []( char c ){
183  // return c != '=' && c != ' ' && c != '\n';
184  // });
185  // utils::read_char_or_throw( it, []( char c ){ return c == '=' || c == ' '; });
186  // if( !it || *it == '\n' ) {
187  // throw std::runtime_error(
188  // std::string("In ") + it.source_name() +
189  // ": Unexpected end of line after attribute key at " + it.at()
190  // );
191  // }
192  //
193  // // Read value
194  // std::string value;
195  // assert( it );
196  // if( *it == '"' ) {
197  // value = utils::parse_quoted_string( it );
198  // } else {
199  // value = utils::read_while( it, []( char c ){ return c != ';' && c != '\n'; });
200  // }
201  //
202  // // Store key value pair
203  // feature.attributes.emplace_back( std::move( key ), std::move( value ));
204  //
205  // // The attributes end with a closing `;`, which we just want to skip
206  // if( it && *it == ';' ) {
207  // ++it;
208  // }
209  // utils::skip_while( it, ' ' );
210  // }
211  // assert( !it || *it == '\n' );
212  // ++it;
213 
214  return true;
215 }
216 
217 } // namespace population
218 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
parser.hpp
genesis::population::GffReader::Feature::feature
std::string feature
Definition: gff_reader.hpp:85
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::population::GffReader::Feature::frame
signed char frame
Definition: gff_reader.hpp:90
genesis::utils::InputStream::buffer
std::pair< char const *, size_t > buffer()
Direct access to the internal buffer.
Definition: input_stream.hpp:545
genesis::population::GffReader::Feature::source
std::string source
Definition: gff_reader.hpp:84
genesis::population::GffReader::Feature::start
size_t start
Definition: gff_reader.hpp:86
genesis::population::GffReader::parse_line
bool parse_line(utils::InputStream &input_stream, Feature &feature) const
Definition: gff_reader.cpp:88
genesis::population::GffReader::Feature
Definition: gff_reader.hpp:81
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:82
genesis::population::GffReader::Feature::attributes_group
std::string attributes_group
Definition: gff_reader.hpp:91
genesis::population::GffReader::Feature::seqname
std::string seqname
Definition: gff_reader.hpp:83
genesis::utils::read_char_or_throw
char read_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that reads a single char from the stream and checks whether it equals the provided on...
Definition: scanner.hpp:299
genesis::population::GffReader::read
std::vector< Feature > read(std::shared_ptr< utils::BaseInputSource > source) const
Read a GFF2/GFF3/GTF input source, and return its content as a list of Feature structs.
Definition: gff_reader.cpp:51
logging.hpp
Provides easy and fast logging functionality.
genesis::population::merge
BaseCounts merge(BaseCounts const &p1, BaseCounts const &p2)
Merge the counts of two BaseCounts objects.
Definition: population/functions/functions.cpp:372
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome, numerical_type start, numerical_type end, bool overlap=false)
Add a GenomeRegion to the list, given its chromosome, and start and end positions.
Definition: genome_region_list.hpp:130
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GffReader::Feature::end
size_t end
Definition: gff_reader.hpp:87
char.hpp
scanner.hpp
gff_reader.hpp
genesis::population::GffReader::read_as_genome_region_list
GenomeRegionList read_as_genome_region_list(std::shared_ptr< utils::BaseInputSource > source, bool merge=false) const
Read a GFF2/GFF3/GTF input source, and return its content as a GenomeRegionList.
Definition: gff_reader.cpp:63
genesis::population::GffReader::Feature::strand
char strand
Definition: gff_reader.hpp:89
genesis::population::GffReader::Feature::score
double score
Definition: gff_reader.hpp:88