A library for working with phylogenetic and population genetic data.
v0.32.0
gff_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
38 
39 #include <cassert>
40 #include <cstring>
41 #include <limits>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Reading
49 // =================================================================================================
50 
51 std::vector<GffReader::Feature> GffReader::read(
52  std::shared_ptr< utils::BaseInputSource > source
53 ) const {
54  std::vector<GffReader::Feature> result;
55  utils::InputStream it( source );
56  Feature feat;
57  while( parse_line( it, feat ) ) {
58  result.push_back( std::move( feat ));
59  }
60  return result;
61 }
62 
64  std::shared_ptr< utils::BaseInputSource > source
65 ) const {
66  GenomeLocusSet result;
67  utils::InputStream it( source );
68  Feature feat;
69  while( parse_line( it, feat ) ) {
70  result.add( feat.seqname, feat.start, feat.end );
71  }
72  return result;
73 }
74 
76  std::shared_ptr< utils::BaseInputSource > source,
77  bool merge
78 ) const {
79  GenomeRegionList result;
80  read_as_genome_region_list( source, result, merge );
81  return result;
82 }
83 
85  std::shared_ptr< utils::BaseInputSource > source,
86  GenomeRegionList& target,
87  bool merge
88 ) const {
89  utils::InputStream it( source );
90  Feature feat;
91  while( parse_line( it, feat ) ) {
92  target.add( feat.seqname, feat.start, feat.end, merge );
93  }
94 }
95 
96 // =================================================================================================
97 // Parsing
98 // =================================================================================================
99 
101  utils::InputStream& input_stream,
102  GffReader::Feature& feature
103 ) const {
104  // Shorthand.
105  auto& it = input_stream;
106  if( !it ) {
107  return false;
108  }
109 
110  // In the following, whenever we use one of the utils stream reading functions, those
111  // also check for end of stream and throw whenever their expected format is not found
112  // (e.g., when parsing a number, there has to be a number - empty/end of stream will throw).
113  // So, in all these cases, we do not need additional checks here; we hence only add checks
114  // for end of stream etc when we read the chars ourselves here (e.g., for the strand).
115 
116  // Track lines are ignored. We look for the word "track " with a space.
117  // If found, we skip the line.
118  while( true ) {
119  auto const buff = it.buffer();
120  if(
121  ( buff.second >= 6 && strncmp( buff.first, "track ", 6 ) == 0 ) ||
122  ( buff.second >= 8 && strncmp( buff.first, "browser ", 8 ) == 0 ) ||
123  ( *it == '#' || *it == '\n' )
124  ) {
125  it.get_line();
126  } else {
127  break;
128  }
129  }
130 
131  // Read seqname, source, and feature.
132  // We use \n as stopping criterion here as well, so that in case of an error,
133  // we at least report the error in the correct line.
134  feature.seqname = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
135  utils::read_char_or_throw( it, '\t' );
136  feature.source = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
137  utils::read_char_or_throw( it, '\t' );
138  feature.feature = utils::read_while( it, []( char c ){ return c != '\t' && c != '\n'; });
139  utils::read_char_or_throw( it, '\t' );
140 
141  // Read start and end
142  feature.start = utils::parse_unsigned_integer<size_t>( it );
143  utils::read_char_or_throw( it, '\t' );
144  feature.end = utils::parse_unsigned_integer<size_t>( it );
145  utils::read_char_or_throw( it, '\t' );
146 
147  // Read score, allowing for it to be empty
148  if( it && *it == '.' ) {
149  feature.score = std::numeric_limits<double>::quiet_NaN();
150  ++it;
151  } else {
152  feature.score = utils::parse_float<double>( it );
153  }
154  utils::read_char_or_throw( it, '\t' );
155 
156  // Read strand
157  if( !it ) {
158  throw std::runtime_error(
159  std::string("In ") + it.source_name() + ": Unexpected end of input at " + it.at()
160  );
161  }
162  feature.strand = *it;
163  ++it;
164  utils::read_char_or_throw( it, '\t' );
165 
166  // Read frame, allowing for it to be empty
167  if( it && *it == '.' ) {
168  feature.frame = -1;
169  ++it;
170  } else {
171  feature.frame = utils::parse_unsigned_integer<signed char>( it );
172  }
173 
174  // There might be no attributes, and the line might end early.
175  // Otherwise, there needs to be a tab befor the attributes.
176  if( !it || *it == '\n' ) {
177  ++it;
178  return true;
179  }
180  utils::read_char_or_throw( it, '\t' );
181 
182  // Read attributes. GFF2, GFF3, and GTF are slightly different, it seems... Bioinformatics...
183  // We just read all of them into a single string.
184  feature.attributes_group = it.get_line();
185 
186  // Old code that works for GFF3 and GTF, but not for GFF2. Kept for future refinement.
187  // GFF2 has just a simple string at the end, while GFF3 has `=` between key and value,
188  // and GTF uses space, and with a semicolon at the end. We here allow for all these variants.
189  // GFF: `hid=trf; hstart=1; hend=21`
190  // GTF: `gene_id "ENSG00000223972"; gene_name "DDX11L1";`
191  // while( it && *it != '\n' ) {
192  // // Read key
193  // utils::skip_while( it, ' ' );
194  // std::string key = utils::read_while( it, []( char c ){
195  // return c != '=' && c != ' ' && c != '\n';
196  // });
197  // utils::read_char_or_throw( it, []( char c ){ return c == '=' || c == ' '; });
198  // if( !it || *it == '\n' ) {
199  // throw std::runtime_error(
200  // std::string("In ") + it.source_name() +
201  // ": Unexpected end of line after attribute key at " + it.at()
202  // );
203  // }
204  //
205  // // Read value
206  // std::string value;
207  // assert( it );
208  // if( *it == '"' ) {
209  // value = utils::parse_quoted_string( it );
210  // } else {
211  // value = utils::read_while( it, []( char c ){ return c != ';' && c != '\n'; });
212  // }
213  //
214  // // Store key value pair
215  // feature.attributes.emplace_back( std::move( key ), std::move( value ));
216  //
217  // // The attributes end with a closing `;`, which we just want to skip
218  // if( it && *it == ';' ) {
219  // ++it;
220  // }
221  // utils::skip_while( it, ' ' );
222  // }
223  // assert( !it || *it == '\n' );
224  // ++it;
225 
226  return true;
227 }
228 
229 } // namespace population
230 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
parser.hpp
genesis::population::merge
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCounts objects.
Definition: population/function/functions.cpp:400
genesis::population::GffReader::Feature::feature
std::string feature
Definition: gff_reader.hpp:86
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_region_list.hpp:139
genesis::population::GffReader::Feature::frame
signed char frame
Definition: gff_reader.hpp:91
genesis::utils::InputStream::buffer
std::pair< char const *, size_t > buffer()
Direct access to the internal buffer.
Definition: input_stream.hpp:390
genesis::population::GffReader::Feature::source
std::string source
Definition: gff_reader.hpp:85
genesis::population::GffReader::Feature::start
size_t start
Definition: gff_reader.hpp:87
genesis::population::GffReader::parse_line
bool parse_line(utils::InputStream &input_stream, Feature &feature) const
Definition: gff_reader.cpp:100
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::population::GffReader::Feature
Definition: gff_reader.hpp:82
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::GffReader::Feature::attributes_group
std::string attributes_group
Definition: gff_reader.hpp:92
genesis::population::GffReader::Feature::seqname
std::string seqname
Definition: gff_reader.hpp:84
genesis::utils::read_char_or_throw
char read_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that reads a single char from the stream and checks whether it equals the provided on...
Definition: scanner.hpp:299
genesis::population::GffReader::read
std::vector< Feature > read(std::shared_ptr< utils::BaseInputSource > source) const
Read a GFF2/GFF3/GTF input source, and return its content as a list of Feature structs.
Definition: gff_reader.cpp:51
logging.hpp
Provides easy and fast logging functionality.
genesis::population::GffReader::read_as_genome_locus_set
GenomeLocusSet read_as_genome_locus_set(std::shared_ptr< utils::BaseInputSource > source) const
Read an input source, and return its content as a GenomeLocusSet.
Definition: gff_reader.cpp:63
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GffReader::Feature::end
size_t end
Definition: gff_reader.hpp:88
char.hpp
scanner.hpp
gff_reader.hpp
genesis::population::GffReader::read_as_genome_region_list
GenomeRegionList read_as_genome_region_list(std::shared_ptr< utils::BaseInputSource > source, bool merge=false) const
Read a GFF2/GFF3/GTF input source, and return its content as a GenomeRegionList.
Definition: gff_reader.cpp:75
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127
genesis::population::GffReader::Feature::strand
char strand
Definition: gff_reader.hpp:90
genesis::population::GffReader::Feature::score
double score
Definition: gff_reader.hpp:89