A library for working with phylogenetic and population genetic data.
v0.32.0
bed_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
37 
38 #include <cassert>
39 #include <limits>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace population {
44 
45 // =================================================================================================
46 // Reading
47 // =================================================================================================
48 
49 std::vector<BedReader::Feature> BedReader::read(
50  std::shared_ptr< utils::BaseInputSource > source
51 ) const {
52  std::vector<BedReader::Feature> result;
53  read_( source, [&]( Feature&& feat ){
54  result.push_back( std::move( feat ));
55  });
56  return result;
57 }
58 
60  std::shared_ptr< utils::BaseInputSource > source
61 ) const {
62  GenomeLocusSet result;
63  read_( source, [&]( Feature&& feat ){
64  result.add( feat.chrom, feat.chrom_start, feat.chrom_end );
65  });
66  return result;
67 }
68 
70  std::shared_ptr< utils::BaseInputSource > source,
71  bool merge
72 ) const {
73  GenomeRegionList result;
74  read_as_genome_region_list( source, result, merge );
75  return result;
76 }
77 
79  std::shared_ptr< utils::BaseInputSource > source,
80  GenomeRegionList& target,
81  bool merge
82 ) const {
83  read_( source, [&]( Feature&& feat ){
84  target.add( feat.chrom, feat.chrom_start, feat.chrom_end, merge );
85  });
86 }
87 
88 // =================================================================================================
89 // Internal Helpers
90 // =================================================================================================
91 
92 void BedReader::read_(
93  std::shared_ptr< utils::BaseInputSource > source,
94  std::function<void(Feature&&)> callback
95 ) const {
96  utils::InputStream it( source );
97 
98  // We use an internal reading function that takes care of checking that the number of columns
99  // is constant throughout the input file. This avoids code duplication for these checkes
100  // in the two public read functions.
101 
102  Feature feat;
103  size_t expected_columns = 0;
104  size_t found_columns;
105  while(( found_columns = parse_line_( it, feat ) )) {
106  if( expected_columns == 0 ) {
107  expected_columns = found_columns;
108  } else if( expected_columns != found_columns ) {
109  // Called with the stream at the next line already. Need to compensate for this.
110  assert( it.line() > 0 );
111  throw std::runtime_error(
112  "Inconsistent number of columns in BED input. Expected " +
113  std::to_string( expected_columns ) + " based on first row, but found " +
114  std::to_string( found_columns ) + " in line " + std::to_string( it.line() - 1 )
115  );
116  }
117  assert( found_columns == expected_columns );
118  callback( std::move( feat ));
119  feat = Feature();
120  }
121 }
122 
123 size_t BedReader::parse_line_(
124  utils::InputStream& input_stream,
125  BedReader::Feature& feature
126 ) const {
127  // Setup.
128  size_t found_columns = 0;
129  auto& it = input_stream;
130  if( !it ) {
131  return found_columns;
132  }
133 
134  // The BED format unfortunately does not have a proper separation of the header lines,
135  // and instead just depends on non-standard keywords in the beginning of lines...
136  // Bit weird to parse, but okay. Also, it is unclear if `track` can appear anywhere in a file
137  // or just in the beginning... So, we just allow for all of that, and test for such comments
138  // in every line.
139  bool is_comment = true;
140  std::string first_word;
141  while( is_comment ) {
142  first_word = parse_string_( it );
143  if(
144  first_word == "browser" ||
145  first_word == "track" ||
146  utils::starts_with( first_word, "#" )
147  ) {
148  // Read until the end of the comment line; we just ignore it.
149  it.get_line();
150  } else {
151  is_comment = false;
152  }
153  }
154  if( !it ) {
155  // Comments at the end of the file. Just finish here.
156  return found_columns;
157  }
158 
159  // Read chrom and start and end coordinates. These are the mandatory ones in bed.
160  // Chrom will simply be the first entry after the header.
161  feature.chrom = first_word;
162  if( ! next_field_( it, found_columns )) {
163  throw std::runtime_error(
164  "BED input expected to have three mandatory columns chrom,start,end in the beginning "
165  "of the line, but only chrom was found at " + it.at()
166  );
167  }
168  feature.chrom_start = utils::parse_unsigned_integer<size_t>( it ) + 1;
169  if( ! next_field_( it, found_columns )) {
170  throw std::runtime_error(
171  "BED input expected to have three mandatory columns chrom,start,end in the beginning "
172  "of the line, but only chrom and start were found at " + it.at()
173  );
174  }
175  feature.chrom_end = utils::parse_unsigned_integer<size_t>( it );
176 
177  // From now on we need to check before every field if there is more data...
178 
179  // name
180  if( ! next_field_( it, found_columns )) {
181  return found_columns;
182  }
183  feature.name = parse_string_( it );
184 
185  // score
186  if( ! next_field_( it, found_columns )) {
187  return found_columns;
188  }
189  feature.score = utils::parse_unsigned_integer<size_t>( it );
190  if( feature.score > 1000 ) {
191  throw std::runtime_error( "Invalid score > 1000 in BED input at " + it.at() );
192  }
193 
194  // strand
195  if( ! next_field_( it, found_columns )) {
196  return found_columns;
197  }
198  feature.strand = utils::read_char_or_throw( input_stream, []( char c ){
199  return c == '+' || c == '-' || c == '.';
200  });
201 
202  // thick_start. Need to adjust for 0-based again.
203  if( ! next_field_( it, found_columns )) {
204  return found_columns;
205  }
206  feature.thick_start = utils::parse_unsigned_integer<size_t>( it ) + 1;
207 
208  // thick_end
209  if( ! next_field_( it, found_columns )) {
210  return found_columns;
211  }
212  feature.thick_end = utils::parse_unsigned_integer<size_t>( it );
213 
214  // item_rgb
215  if( ! next_field_( it, found_columns )) {
216  return found_columns;
217  }
218  feature.item_rgb = parse_string_( it );
219 
220  // block_count
221  if( ! next_field_( it, found_columns )) {
222  return found_columns;
223  }
224  feature.block_count = utils::parse_unsigned_integer<size_t>( it );
225 
226  // block_sizes
227  if( ! next_field_( it, found_columns )) {
228  return found_columns;
229  }
230  auto block_sizes = utils::split( parse_string_( it ), "," );
231  for( auto const& bs : block_sizes ) {
232  if( ! bs.empty() ) {
233  feature.block_sizes.push_back( stoull( bs ));
234  }
235  }
236  if( feature.block_sizes.size() != feature.block_count ) {
237  throw std::runtime_error(
238  "Invalid blockSizes length in BED input. Expected " +
239  std::to_string( feature.block_count ) + " based on blockCount, but found " +
240  std::to_string( feature.block_sizes.size() ) + " values instead, at " + it.at()
241  );
242  }
243 
244  // block_starts
245  if( ! next_field_( it, found_columns )) {
246  return found_columns;
247  }
248  auto block_starts = utils::split( parse_string_( it ), "," );
249  for( auto const& bs : block_sizes ) {
250  if( ! bs.empty() ) {
251  feature.block_starts.push_back( stoull( bs ));
252  }
253  }
254  if( feature.block_starts.size() != feature.block_count ) {
255  throw std::runtime_error(
256  "Invalid blockStarts length in BED input. Expected " +
257  std::to_string( feature.block_count ) + " based on blockCount, but found " +
258  std::to_string( feature.block_starts.size() ) + " values instead, at " + it.at()
259  );
260  }
261 
262  // all remaining (unsupported, but ignored) columns
263  while( next_field_( it, found_columns ) ) {
264  parse_string_( it );
265  }
266 
267  // next_field_() already takes care of jumping to the next line, if there is one.
268  // So, nothing more to do here.
269  return found_columns;
270 }
271 
272 bool BedReader::next_field_( utils::InputStream& input_stream, size_t& found_columns ) const
273 {
274  // Throws for anything that is not a new line or field delimiter (tab or space).
275  // We expect at least one delimiter, and then skip any more. It is not clear from
276  // the format if that is the right way, or if empty fields are possible, but that
277  // seems how it is supposed to be...
278  // Returns true iff a next field was delimited and starts there.
279  if( ! input_stream || *input_stream == '\n' ) {
280  assert( !input_stream || *input_stream == '\n' );
281  ++input_stream;
282  return false;
283  }
284  assert( input_stream && *input_stream != '\n' );
285  utils::read_char_or_throw( input_stream, []( char c ){
286  return c == '\t' || c == ' ';
287  });
288  utils::skip_while( input_stream, []( char c ){
289  return c == '\t' || c == ' ';
290  });
291  if( ! input_stream || *input_stream == '\n' ) {
292  throw std::runtime_error( "Unexpected end of BED input at " + input_stream.at() );
293  }
294  ++found_columns;
295  return true;
296 }
297 
298 std::string BedReader::parse_string_( utils::InputStream& input_stream ) const
299 {
300  // We use \n as stopping criterion here as well, so that in case of an error,
301  // we at least report the error in the correct line.
302  return utils::read_while( input_stream, []( char c ){
303  return c != '\t' && c != ' ' && c != '\n';
304  });
305 }
306 
307 } // namespace population
308 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
parser.hpp
genesis::population::merge
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
Definition: population/function/functions.cpp:400
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_region_list.hpp:139
bed_reader.hpp
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::utils::starts_with
bool starts_with(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix.
Definition: string.cpp:136
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
genesis::population::BedReader::Feature
Store all values that can typically appear in the columns of a BED file.
Definition: bed_reader.hpp:106
genesis::utils::read_char_or_throw
char read_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that reads a single char from the stream and checks whether it equals the provided on...
Definition: scanner.hpp:299
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
genesis::population::BedReader::read
std::vector< Feature > read(std::shared_ptr< utils::BaseInputSource > source) const
Read a BED input source, and return its content as a list of Feature structs.
Definition: bed_reader.cpp:49
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
char.hpp
genesis::population::BedReader::read_as_genome_locus_set
GenomeLocusSet read_as_genome_locus_set(std::shared_ptr< utils::BaseInputSource > source) const
Read an input source, and return its content as a GenomeLocusSet.
Definition: bed_reader.cpp:59
scanner.hpp
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127
genesis::population::BedReader::read_as_genome_region_list
GenomeRegionList read_as_genome_region_list(std::shared_ptr< utils::BaseInputSource > source, bool merge=false) const
Read a BED input source, and return its content as a GenomeRegionList.
Definition: bed_reader.cpp:69