A library for working with phylogenetic and population genetic data.
v0.27.0
bed_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
37 
38 #include <cassert>
39 #include <limits>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace population {
44 
45 // =================================================================================================
46 // Reading
47 // =================================================================================================
48 
49 std::vector<BedReader::Feature> BedReader::read(
50  std::shared_ptr< utils::BaseInputSource > source
51 ) const {
52  std::vector<BedReader::Feature> result;
53  read_( source, [&]( Feature&& feat ){
54  result.push_back( std::move( feat ));
55  });
56  return result;
57 }
58 
60  std::shared_ptr< utils::BaseInputSource > source,
61  bool merge
62 ) const {
63  GenomeRegionList result;
64  read_as_genome_region_list( source, result, merge );
65  return result;
66 }
67 
69  std::shared_ptr< utils::BaseInputSource > source,
70  GenomeRegionList& target,
71  bool merge
72 ) const {
73  read_( source, [&]( Feature&& feat ){
74  target.add( feat.chrom, feat.chrom_start, feat.chrom_end, merge );
75  });
76 }
77 
78 // =================================================================================================
79 // Internal Helpers
80 // =================================================================================================
81 
82 void BedReader::read_(
83  std::shared_ptr< utils::BaseInputSource > source,
84  std::function<void(Feature&&)> callback
85 ) const {
86  utils::InputStream it( source );
87 
88  // We use an internal reading function that takes care of checking that the number of columns
89  // is constant throughout the input file. This avoids code duplication for these checkes
90  // in the two public read functions.
91 
92  Feature feat;
93  size_t expected_columns = 0;
94  size_t found_columns;
95  while(( found_columns = parse_line_( it, feat ) )) {
96  if( expected_columns == 0 ) {
97  expected_columns = found_columns;
98  } else if( expected_columns != found_columns ) {
99  // Called with the stream at the next line already. Need to compensate for this.
100  assert( it.line() > 0 );
101  throw std::runtime_error(
102  "Inconsistent number of columns in BED input. Expected " +
103  std::to_string( expected_columns ) + " based on first row, but found " +
104  std::to_string( found_columns ) + " in line " + std::to_string( it.line() - 1 )
105  );
106  }
107  assert( found_columns == expected_columns );
108  callback( std::move( feat ));
109  feat = Feature{};
110  }
111 }
112 
113 size_t BedReader::parse_line_(
114  utils::InputStream& input_stream,
115  BedReader::Feature& feature
116 ) const {
117  // Setup.
118  size_t found_columns = 0;
119  auto& it = input_stream;
120  if( !it ) {
121  return found_columns;
122  }
123 
124  // The BED format unfortunately does not have a proper separation of the header lines,
125  // and instead just depends on non-standard keywords in the beginning of lines...
126  // Bit weird to parse, but okay. Also, it is unclear if `track` can appear anywhere in a file
127  // or just in the beginning... So, we just allow for all of that, and test for such comments
128  // in every line.
129  bool is_comment = true;
130  std::string first_word;
131  while( is_comment ) {
132  first_word = parse_string_( it );
133  if(
134  first_word == "browser" ||
135  first_word == "track" ||
136  utils::starts_with( first_word, "#" )
137  ) {
138  // Read until the end of the comment line; we just ignore it.
139  it.get_line();
140  } else {
141  is_comment = false;
142  }
143  }
144  if( !it ) {
145  // Comments at the end of the file. Just finish here.
146  return found_columns;
147  }
148 
149  // Read chrom and start and end coordinates. These are the mandatory ones in bed.
150  // Chrom will simply be the first entry after the header.
151  feature.chrom = first_word;
152  if( ! next_field_( it, found_columns )) {
153  throw std::runtime_error(
154  "BED input expected to have three mandatory columns chrom,start,end in the beginning "
155  "of the line, but only chrom was found at " + it.at()
156  );
157  }
158  feature.chrom_start = utils::parse_unsigned_integer<size_t>( it ) + 1;
159  if( ! next_field_( it, found_columns )) {
160  throw std::runtime_error(
161  "BED input expected to have three mandatory columns chrom,start,end in the beginning "
162  "of the line, but only chrom and start were found at " + it.at()
163  );
164  }
165  feature.chrom_end = utils::parse_unsigned_integer<size_t>( it );
166 
167  // From now on we need to check before every field if there is more data...
168 
169  // name
170  if( ! next_field_( it, found_columns )) {
171  return found_columns;
172  }
173  feature.name = parse_string_( it );
174 
175  // score
176  if( ! next_field_( it, found_columns )) {
177  return found_columns;
178  }
179  feature.score = utils::parse_unsigned_integer<size_t>( it );
180  if( feature.score > 1000 ) {
181  throw std::runtime_error( "Invalid score > 1000 in BED input at " + it.at() );
182  }
183 
184  // strand
185  if( ! next_field_( it, found_columns )) {
186  return found_columns;
187  }
188  feature.strand = utils::read_char_or_throw( input_stream, []( char c ){
189  return c == '+' || c == '-' || c == '.';
190  });
191 
192  // thick_start. Need to adjust for 0-based again.
193  if( ! next_field_( it, found_columns )) {
194  return found_columns;
195  }
196  feature.thick_start = utils::parse_unsigned_integer<size_t>( it ) + 1;
197 
198  // thick_end
199  if( ! next_field_( it, found_columns )) {
200  return found_columns;
201  }
202  feature.thick_end = utils::parse_unsigned_integer<size_t>( it );
203 
204  // item_rgb
205  if( ! next_field_( it, found_columns )) {
206  return found_columns;
207  }
208  feature.item_rgb = parse_string_( it );
209 
210  // block_count
211  if( ! next_field_( it, found_columns )) {
212  return found_columns;
213  }
214  feature.block_count = utils::parse_unsigned_integer<size_t>( it );
215 
216  // block_sizes
217  if( ! next_field_( it, found_columns )) {
218  return found_columns;
219  }
220  auto block_sizes = utils::split( parse_string_( it ), "," );
221  for( auto const& bs : block_sizes ) {
222  if( ! bs.empty() ) {
223  feature.block_sizes.push_back( stoull( bs ));
224  }
225  }
226  if( feature.block_sizes.size() != feature.block_count ) {
227  throw std::runtime_error(
228  "Invalid blockSizes length in BED input. Expected " +
229  std::to_string( feature.block_count ) + " based on blockCount, but found " +
230  std::to_string( feature.block_sizes.size() ) + " values instead, at " + it.at()
231  );
232  }
233 
234  // block_starts
235  if( ! next_field_( it, found_columns )) {
236  return found_columns;
237  }
238  auto block_starts = utils::split( parse_string_( it ), "," );
239  for( auto const& bs : block_sizes ) {
240  if( ! bs.empty() ) {
241  feature.block_starts.push_back( stoull( bs ));
242  }
243  }
244  if( feature.block_starts.size() != feature.block_count ) {
245  throw std::runtime_error(
246  "Invalid blockStarts length in BED input. Expected " +
247  std::to_string( feature.block_count ) + " based on blockCount, but found " +
248  std::to_string( feature.block_starts.size() ) + " values instead, at " + it.at()
249  );
250  }
251 
252  // all remaining (unsupported, but ignored) columns
253  while( next_field_( it, found_columns ) ) {
254  parse_string_( it );
255  }
256 
257  // next_field_() already takes care of jumping to the next line, if there is one.
258  // So, nothing more to do here.
259  return found_columns;
260 }
261 
262 bool BedReader::next_field_( utils::InputStream& input_stream, size_t& found_columns ) const
263 {
264  // Throws for anything that is not a new line or field delimiter (tab or space).
265  // We expect at least one delimiter, and then skip any more. It is not clear from
266  // the format if that is the right way, or if empty fields are possible, but that
267  // seems how it is supposed to be...
268  // Returns true iff a next field was delimited and starts there.
269  if( ! input_stream || *input_stream == '\n' ) {
270  assert( !input_stream || *input_stream == '\n' );
271  ++input_stream;
272  return false;
273  }
274  assert( input_stream && *input_stream != '\n' );
275  utils::read_char_or_throw( input_stream, []( char c ){
276  return c == '\t' || c == ' ';
277  });
278  utils::skip_while( input_stream, []( char c ){
279  return c == '\t' || c == ' ';
280  });
281  if( ! input_stream || *input_stream == '\n' ) {
282  throw std::runtime_error( "Unexpected end of BED input at " + input_stream.at() );
283  }
284  ++found_columns;
285  return true;
286 }
287 
288 std::string BedReader::parse_string_( utils::InputStream& input_stream ) const
289 {
290  // We use \n as stopping criterion here as well, so that in case of an error,
291  // we at least report the error in the correct line.
292  return utils::read_while( input_stream, []( char c ){
293  return c != '\t' && c != ' ' && c != '\n';
294  });
295 }
296 
297 
298 } // namespace population
299 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
parser.hpp
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
bed_reader.hpp
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:82
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
string.hpp
Provides some commonly used string utility functions.
genesis::population::BedReader::Feature
Store all values that can typically appear in the columns of a BED file.
Definition: bed_reader.hpp:105
genesis::utils::read_char_or_throw
char read_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that reads a single char from the stream and checks whether it equals the provided on...
Definition: scanner.hpp:299
genesis::utils::starts_with
bool starts_with(std::string const &text, std::string const &start)
Return whether a string starts with another string.
Definition: string.cpp:81
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
genesis::population::merge
BaseCounts merge(BaseCounts const &p1, BaseCounts const &p2)
Merge the counts of two BaseCountss.
Definition: population/functions/functions.cpp:372
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome, numerical_type start, numerical_type end, bool overlap=false)
Add a GenomeRegion to the list, given its chromosome, and start and end positions.
Definition: genome_region_list.hpp:130
genesis::population::BedReader::read
std::vector< Feature > read(std::shared_ptr< utils::BaseInputSource > source) const
Read a BED input source, and return its content as a list of Feature structs.
Definition: bed_reader.cpp:49
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
char.hpp
scanner.hpp
genesis::population::BedReader::read_as_genome_region_list
GenomeRegionList read_as_genome_region_list(std::shared_ptr< utils::BaseInputSource > source, bool merge=false) const
Read a BED input source, and return its content as a GenomeRegionList.
Definition: bed_reader.cpp:59
genesis::utils::split
std::vector< std::string > split(std::string const &str, std::string const &delimiters, const bool trim_empty)
Spilt a string into parts, given a delimiters set of chars.
Definition: string.cpp:386