A library for working with phylogenetic and population genetic data.
v0.32.0
map_bim_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
38 
39 #include <cassert>
40 #include <limits>
41 #include <stdexcept>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Reading
48 // =================================================================================================
49 
50 std::vector<MapBimReader::Feature> MapBimReader::read(
51  std::shared_ptr< utils::BaseInputSource > source
52 ) const {
53  std::vector<MapBimReader::Feature> result;
54  read_( source, [&]( Feature&& feat ){
55  result.push_back( std::move( feat ));
56  });
57  return result;
58 }
59 
61  std::shared_ptr< utils::BaseInputSource > source
62 ) const {
63  GenomeLocusSet result;
64  read_( source, [&]( Feature&& feat ){
65  result.add( feat.chromosome, feat.coordinate, feat.coordinate );
66  });
67  return result;
68 }
69 
71  std::shared_ptr< utils::BaseInputSource > source,
72  bool merge
73 ) const {
74  GenomeRegionList result;
75  read_as_genome_region_list( source, result, merge );
76  return result;
77 }
78 
80  std::shared_ptr< utils::BaseInputSource > source,
81  GenomeRegionList& target,
82  bool merge
83 ) const {
84  read_( source, [&]( Feature&& feat ){
85  target.add( feat.chromosome, feat.coordinate, feat.coordinate, merge );
86  });
87 }
88 
89 // =================================================================================================
90 // Internal Helpers
91 // =================================================================================================
92 
93 void MapBimReader::read_(
94  std::shared_ptr< utils::BaseInputSource > source,
95  std::function<void(Feature&&)> callback
96 ) const {
97  utils::InputStream it( source );
98 
99  // We use an internal reading function that takes care of checking that the number of columns
100  // is constant throughout the input file. This avoids code duplication for these checkes
101  // in the two public read functions. However, this is kind of code duplication from the bed
102  // reader... not much to do about this right now. The formats are distinct enough that
103  // refactoring this into some common reader functionality is not worth it at this time.
104 
105  Feature feat;
106  std::vector<std::string> buffer;
107  size_t expected_columns = 0;
108  size_t found_columns;
109  while(( found_columns = parse_line_( it, feat, buffer ) )) {
110  if( expected_columns == 0 ) {
111  expected_columns = found_columns;
112  } else if( expected_columns != found_columns ) {
113  // Called with the stream at the next line already. Need to compensate for this.
114  assert( it.line() > 0 );
115  throw std::runtime_error(
116  "Inconsistent number of columns in map/bim input. Expected " +
117  std::to_string( expected_columns ) + " based on first row, but found " +
118  std::to_string( found_columns ) + " in line " + std::to_string( it.line() - 1 )
119  );
120  }
121  assert( found_columns == expected_columns );
122 
123  // Throw on coordinate 0 as a weird special case.
124  if( feat.coordinate == 0 ) {
125  // Same line number adjustment as above.
126  assert( it.line() > 0 );
127  throw std::runtime_error(
128  "Invalid base-pair coordinate 0 in map/bim input in line " +
129  std::to_string( it.line() - 1 )
130  );
131  }
132 
133  // Here we apply the skip_negative_coordinates_ setting.
134  if( !( feat.coordinate < 0 && skip_negative_coordinates_ )) {
135  callback( std::move( feat ));
136  }
137  feat = Feature();
138  }
139 }
140 
141 size_t MapBimReader::parse_line_(
142  utils::InputStream& input_stream,
143  MapBimReader::Feature& feature,
144  std::vector<std::string>& buffer
145 ) const {
146  // Setup.
147  size_t found_columns = 0;
148  auto& it = input_stream;
149  if( !it ) {
150  return found_columns;
151  }
152 
153  // Unfortunately, the format has an optional column in the middle, so we need to know the final
154  // number of coumns first before we can start parsing them into specific data types...
155  // So, first, we need to parse all values of the line into the buffer.
156  while( it && *it != '\n' ) {
157  // Make sure that the buffer is large enough.
158  // This does multiple consecutive resizes for the first line, when we do not yet know
159  // how many columns there are in the file, but then none after
160  // (unless there is a mismatch in number of columns, which is an exception anyway).
161  if( buffer.size() < found_columns + 1 ) {
162  buffer.resize( found_columns + 1 );
163  }
164 
165  // Read into the buffer until the next tab or end of line.
166  buffer[found_columns] = utils::read_while( it, []( char c ){
167  return c != '\t' && c != '\n';
168  });
169  if( buffer[found_columns].empty() ) {
170  throw std::runtime_error( "Invalid empty entry of map/bim input at " + it.at() );
171  }
172  ++found_columns;
173 
174  // Skip the separator, if there is one; in that case, we also expect that data follows.
175  if( it && *it == '\t' ) {
176  ++it;
177  if( ! it || *it == '\n' ) {
178  throw std::runtime_error( "Unexpected end of map/bim input at " + it.at() );
179  }
180  }
181  }
182  assert( found_columns <= buffer.size() );
183 
184  // We are done with the line, move to the next, if there is one.
185  if( it ) {
186  assert( *it == '\n' );
187  ++it;
188  }
189 
190  // If we have fewer columns that the buffer, that means the buffer was resized in a
191  // previous iteration to some larger value, which means that the current line is too short.
192  // We can skip the parsing here, as this will lead to an exception in read_() anyway.
193  if( found_columns < buffer.size() ) {
194  return found_columns;
195  }
196  assert( found_columns == buffer.size() );
197 
198  // Validity check.
199  if( found_columns < 3 || found_columns > 6 ) {
200  throw std::runtime_error(
201  "Invalid number of columns (" + std::to_string(found_columns) +
202  " found, but 3-6 expected) of map/bim input at " + it.at()
203  );
204  }
205 
206  // The first two columns are always there. We already checked above that there are 3-6 entries
207  // in the buffer, so we only assert here.
208  assert( found_columns > 2 );
209  assert( buffer.size() > 2 );
210  assert( found_columns == buffer.size() );
211  feature.chromosome = buffer[0];
212  feature.variant_id = buffer[1];
213 
214  // Helper functions to avoid code repetition for the value parsing.
215  auto get_position_ = [&]( std::string const& value ){
216  double result;
217  if( ! utils::convert_to_double( value, result )) {
218  throw std::runtime_error(
219  "Invalid map/bim input with (centi)morgan position that is not a numeric value (\"" +
220  value + "\") at " + it.at()
221  );
222  }
223  return result;
224  };
225  auto get_coordinate_ = [&]( std::string const& value ){
226  long long result;
227  if( ! utils::convert_to_signed_integer( value, result )) {
228  throw std::runtime_error(
229  "Invalid map/bim input with base pair coordinate that is not a numeric value (\"" +
230  value + "\") at " + it.at()
231  );
232  }
233  return result;
234  };
235  auto get_allele_ = [&]( std::string const& value ){
236  if( value.size() != 1 ) {
237  throw std::runtime_error(
238  "Invalid map/bim input with allele that is not a single char (\"" +
239  value + "\") at " + it.at()
240  );
241  }
242  assert( value.size() == 1 );
243  return value[0];
244  };
245 
246  // Now we know how many columns we have read, and can parse them adequately.
247  // We only do the variable columns here, and the first two, that are always present, later.
248  assert( found_columns == buffer.size() );
249  assert( found_columns >= 3 && found_columns <= 6 );
250  switch( found_columns ) {
251  case 3: {
252  // map file without (centi)morgan position, just coordinate.
253  feature.coordinate = get_coordinate_( buffer[2] );
254  break;
255  }
256  case 4: {
257  // map file with (centi)morgan position.
258  feature.position = get_position_( buffer[2] );
259  feature.coordinate = get_coordinate_( buffer[3] );
260  break;
261  }
262  case 5: {
263  // bim file without (centi)morgan position, just coordinate, and alleles.
264  feature.coordinate = get_coordinate_( buffer[2] );
265  feature.allele_1 = get_allele_( buffer[3] );
266  feature.allele_2 = get_allele_( buffer[4] );
267  break;
268  }
269  case 6: {
270  // bim file with (centi)morgan position.
271  feature.position = get_position_( buffer[2] );
272  feature.coordinate = get_coordinate_( buffer[3] );
273  feature.allele_1 = get_allele_( buffer[4] );
274  feature.allele_2 = get_allele_( buffer[5] );
275  break;
276  }
277  default: {
278  // Cannot happen, we checked already.
279  assert( false );
280  }
281  }
282 
283  return found_columns;
284 }
285 
286 } // namespace population
287 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::population::MapBimReader::read_as_genome_locus_set
GenomeLocusSet read_as_genome_locus_set(std::shared_ptr< utils::BaseInputSource > source) const
Read an input source, and return its content as a GenomeLocusSet.
Definition: map_bim_reader.cpp:60
parser.hpp
genesis::population::merge
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
Definition: population/function/functions.cpp:400
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_region_list.hpp:139
genesis::population::MapBimReader::read_as_genome_region_list
GenomeRegionList read_as_genome_region_list(std::shared_ptr< utils::BaseInputSource > source, bool merge=true) const
Read a map/bim input source, and return its content as a GenomeRegionList.
Definition: map_bim_reader.cpp:70
genesis::population::MapBimReader::read
std::vector< Feature > read(std::shared_ptr< utils::BaseInputSource > source) const
Read a map/bim input source, and return its content as a list of Feature structs.
Definition: map_bim_reader.cpp:50
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::population::MapBimReader::Feature
Store all values that can typically appear in the columns of a map/bim file.
Definition: map_bim_reader.hpp:109
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::convert_to_double
void convert_to_double(Dataframe &df, size_t col_index)
Definition: utils/containers/dataframe/operators.cpp:192
string.hpp
Provides some commonly used string utility functions.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
convert.hpp
char.hpp
scanner.hpp
map_bim_reader.hpp
genesis::utils::convert_to_signed_integer
bool convert_to_signed_integer(std::string const &str, long long &result)
Convert a string to signed integer, store the result in result, and return whether the conversion as ...
Definition: convert.cpp:182
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127