A library for working with phylogenetic and population genetic data.
v0.32.0
genome_region_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
38 
39 #include <cassert>
40 #include <limits>
41 #include <stdexcept>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Reading
48 // =================================================================================================
49 
51  std::shared_ptr< utils::BaseInputSource > source
52 ) const {
53  GenomeLocusSet result;
54  read_( source, [&]( GenomeRegion const& region ){
55  result.add( region );
56  });
57  return result;
58 }
59 
61  std::shared_ptr< utils::BaseInputSource > source,
62  bool merge
63 ) const {
64  GenomeRegionList result;
65  read_as_genome_region_list( source, result, merge );
66  return result;
67 }
68 
70  std::shared_ptr< utils::BaseInputSource > source,
71  GenomeRegionList& target,
72  bool merge
73 ) const {
74  read_( source, [&]( GenomeRegion const& region ){
75  target.add( region, merge );
76  });
77 }
78 
79 // =================================================================================================
80 // Internal Helpers
81 // =================================================================================================
82 
83 void GenomeRegionReader::read_(
84  std::shared_ptr< utils::BaseInputSource > source,
85  std::function<void( GenomeRegion const& region )> callback
86 ) const {
87  using namespace genesis::utils;
88  InputStream it( source );
89 
90  // Helper function to throw on error without copies of the same error message each time.
91  auto throw_invalid_arg_ = [&]( InputStream const& it ){
92  throw std::invalid_argument(
93  "Invalid genomic region in " + it.source_name() + " at " + it.at()
94  );
95  };
96 
97  // Read the file; each loop iteration handles one line.
98  while( it ) {
99  GenomeRegion region;
100 
101  // Read the chromosome until the first delimiter is found.
102  // The is_print function is false for tabs, but then we want to stop anyway, so that's okay.
103  region.chromosome = read_while( it, []( char c ){
104  return is_print(c) && c != ':' && c != ' ' && c != '\t' && c != '\n';
105  });
106  if( region.chromosome.empty() || region.chromosome == "-" || region.chromosome == ".." ) {
107  throw_invalid_arg_( it );
108  }
109 
110  // We only allow certain combinations of delimiters, `:` with `-` or `..`,
111  // or white space, but not mixed. So here, we take note of which delimiter we found first.
112  bool colon = true;
113  if( !it || *it == '\n' ) {
114  // No information on positions, so mark it as using the whole chromosome.
115  region.start = 0;
116  region.end = 0;
117  callback( region );
118 
119  // End of the line, move to next. Also works on past the end, no special case needed.
120  ++it;
121  continue;
122  } else if( *it == ':' ) {
123  colon = true;
124  } else if( *it == ' ' || *it == '\t' ) {
125  colon = false;
126  } else {
127  throw_invalid_arg_( it );
128  }
129  ++it;
130 
131  // Now we expect to read the start position. Throws when there is no number.
132  region.start = parse_unsigned_integer<size_t>( it );
133 
134  if( !it || *it == '\n' ) {
135  // We are at the end of the line already - no end position specified.
136  // That means, it's a single position.
137  region.end = region.start;
138  } else {
139  // End position is specified.
140  // Check that the correct delimiters are present, and skip them.
141  if( *it == '-' || *it == '.' ) {
142  if( ! colon ) {
143  throw_invalid_arg_( it );
144  }
145  if( *it == '.' ) {
146  ++it;
147  if( !it || *it != '.' ) {
148  throw_invalid_arg_( it );
149  }
150  }
151  } else if( *it == ' ' || *it == '\t' ) {
152  if( colon ) {
153  throw_invalid_arg_( it );
154  }
155  } else {
156  throw_invalid_arg_( it );
157  }
158  ++it;
159 
160  // Now read the end position. Throws when there is no number.
161  region.end = parse_unsigned_integer<size_t>( it );
162  }
163 
164  // After the above, we must be at the end of the line, otherwise that's an error.
165  if( it && *it != '\n' ) {
166  throw_invalid_arg_( it );
167  }
168 
169  // Fix coordinates as needed.
170  if( zero_based_ ) {
171  ++region.start;
172  ++region.end;
173  }
174  if( end_exclusive_ ) {
175  if( region.end == 0 ) {
176  throw_invalid_arg_( it );
177  }
178  --region.end;
179  }
180 
181  // Validity check.
182  if( ! region.specified() ) {
183  throw_invalid_arg_( it );
184  }
185 
186  // Call the callback, e.g., to add the region to the target list.
187  callback( region );
188 
189  // Move to next line.
190  assert( !it || *it == '\n' );
191  ++it;
192  }
193 }
194 
195 } // namespace population
196 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
parser.hpp
genome_region_reader.hpp
genesis::population::merge
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
Definition: population/function/functions.cpp:400
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_region_list.hpp:139
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::population::GenomeRegionReader::read_as_genome_locus_set
GenomeLocusSet read_as_genome_locus_set(std::shared_ptr< utils::BaseInputSource > source) const
Read an input source, and return its content as a GenomeLocusSet.
Definition: genome_region_reader.cpp:50
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:74
genesis::utils
Definition: placement/formats/edge_color.hpp:42
string.hpp
Provides some commonly used string utility functions.
genesis::utils::is_print
constexpr bool is_print(char c) noexcept
Return whether a char is a printable character, according to isprint of the cctype header,...
Definition: char.hpp:209
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
convert.hpp
char.hpp
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:70
genesis::population::GenomeRegionReader::read_as_genome_region_list
GenomeRegionList read_as_genome_region_list(std::shared_ptr< utils::BaseInputSource > source, bool merge=false) const
Read an input source, and return its content as a GenomeRegionList.
Definition: genome_region_reader.cpp:60
scanner.hpp
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127