A library for working with phylogenetic and population genetic data.
v0.32.0
genome_region.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <iostream>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Comparion Operators
49 // =================================================================================================
50 
51 bool operator ==( GenomeRegion const& a, GenomeRegion const& b )
52 {
53  return a.chromosome == b.chromosome && a.start == b.start && a.end == b.end;
54 }
55 
56 bool operator !=( GenomeRegion const& a, GenomeRegion const& b )
57 {
58  return !( a == b );
59 }
60 
61 // =================================================================================================
62 // Parsing & Printing
63 // =================================================================================================
64 
65 std::ostream& operator<<( std::ostream& os, GenomeRegion const& region )
66 {
67  os << to_string( region );
68  // os << region.chromosome << ": [" << region.start << ", " << region.end << ")";
69  return os;
70 }
71 
72 std::string to_string( GenomeRegion const& region )
73 {
74  // Error cases. We need to check separately here, as we want to be able to treat the
75  // start == end == 0 special case extra here, and just print out the chromosome in that case.
76  if( region.chromosome.empty() ) {
77  throw std::invalid_argument(
78  "Invalid GenomeRegion with empty chromosome."
79  );
80  }
81  if(( region.start == 0 ) != ( region.end == 0 )) {
82  throw std::invalid_argument(
83  "Invalid GenomeRegion with one of start and end equal to zero."
84  );
85  }
86  if( region.start > region.end ) {
87  throw std::invalid_argument(
88  "Invalid GenomeRegion with start > end."
89  );
90  }
91 
92  // Special cases.
93  if( region.start == 0 && region.end == 0 ) {
94  return region.chromosome;
95  } else if( region.start == region.end ) {
96  return region.chromosome + ":" + std::to_string( region.start );
97  }
98 
99  // General case.
100  return
101  region.chromosome + ":" +
102  std::to_string( region.start ) + "-" +
103  std::to_string( region.end )
104  ;
105 }
106 
107 GenomeRegion parse_genome_region( std::string const& region, bool zero_based, bool end_exclusive )
108 {
109  GenomeRegion result;
110 
111  // Helper function to throw on error without copies of the same error message each time.
112  auto throw_invalid_arg_ = [&](){
113  throw std::invalid_argument( "Invalid genomic region string \"" + region + "\"" );
114  };
115 
116  // Helper function to convert a string to a position, with a nicer error message.
117  auto convert_str_to_pos_ = [&]( std::string const& str ){
118  size_t res = 0;
119  try {
120  res = utils::convert_from_string<size_t>( str, true );
121  } catch( ... ) {
122  throw_invalid_arg_();
123  }
124  return res;
125  };
126 
127  // Split by chromosome delimitier and store the chrom. Every string at least yields one split.
128  auto const chr_split = utils::split( region, ":", false );
129  assert( chr_split.size() > 0 );
130  result.chromosome = chr_split[0];
131 
132  // Special cases where either everything is empty, or parts are.
133  if( result.chromosome.empty() || result.chromosome == "-" || result.chromosome == ".." ) {
134  throw_invalid_arg_();
135  }
136 
137  // If there is a part after the `:`, use that for positions.
138  if( chr_split.size() == 2 ) {
139 
140  // Try to split by "-", or if that does not work, try ".." instead.
141  auto pos_split = utils::split( chr_split[1], "-", false );
142  if( pos_split.size() == 1 ) {
143  pos_split = utils::split_at( chr_split[1], "..", false );
144  }
145 
146  // If there is no delimiter, use the number for both start and end.
147  if( pos_split.size() == 1 ) {
148  // Did neither find "-" nor "..". Use position as both start and end.
149  if( pos_split[0].empty() ) {
150  throw_invalid_arg_();
151  }
152  auto const pos = convert_str_to_pos_( pos_split[0] );
153  result.start = pos;
154  result.end = pos;
155  } else if( pos_split.size() == 2 ) {
156  // Found a valid split by "-" or "..".
157  // If either part is empty, error. Otherwise, convert.
158  if( pos_split[0].empty() || pos_split[1].empty() ) {
159  throw_invalid_arg_();
160  }
161  result.start = convert_str_to_pos_( pos_split[0] );
162  result.end = convert_str_to_pos_( pos_split[1] );
163  } else {
164  throw_invalid_arg_();
165  }
166 
167  // Fix coordinates if needed.
168  if( zero_based ) {
169  ++result.start;
170  ++result.end;
171  }
172  if( end_exclusive ) {
173  if( result.end == 0 ) {
174  throw_invalid_arg_();
175  }
176  --result.end;
177  }
178 
179  // Validity check.
180  if( ! result.specified() ) {
181  throw_invalid_arg_();
182  }
183  } else if( chr_split.size() > 2 ) {
184  // Multiple ":" found.
185  throw_invalid_arg_();
186  }
187  return result;
188 }
189 
191  std::string const& regions, bool zero_based, bool end_exclusive
192 ) {
193  GenomeRegionList result;
194 
195  auto const region_list = utils::split( regions, ",", false );
196  for( auto const& region : region_list ) {
197  result.add( parse_genome_region( utils::trim( region ), zero_based, end_exclusive ));
198  }
199 
200  return result;
201 }
202 
203 // =================================================================================================
204 // Region Coverage
205 // =================================================================================================
206 
207 bool is_covered( GenomeRegion const& region, std::string const& chromosome, size_t position )
208 {
209  if( region.start > region.end ) {
210  throw std::runtime_error( "Invalid GenomeRegion with start > end" );
211  }
212 
213  if( region.start > 0 && region.end > 0 ) {
214  // With proper start and/or end, all has to match.
215  auto const chr = chromosome == region.chromosome;
216  auto const beg = position >= region.start;
217  auto const end = position <= region.end;
218  return chr && beg && end;
219  } else if( region.start == 0 && region.end == 0 ) {
220  // If both start and end are zero, we are just matching the chromosome.
221  return chromosome == region.chromosome;
222  }
223 
224  // Edge error case, could throw here as well.
225  assert( region.start == 0 || region.end == 0 );
226  return false;
227 }
228 
229 } // namespace population
230 } // namespace genesis
genome_region.hpp
genesis::population::operator!=
bool operator!=(GenomeLocus const &l, GenomeLocus const &r)
Inequality comparison (!=) for two loci in a genome.
Definition: function/genome_locus.hpp:396
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, SampleCounts const &bs)
Output stream operator for SampleCounts instances.
Definition: population/function/functions.cpp:649
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_region_list.hpp:139
genesis::population::operator==
bool operator==(GenomeLocus const &l, GenomeLocus const &r)
Equality comparison (==) for two loci in a genome.
Definition: function/genome_locus.hpp:340
genesis::utils::trim
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces (or any other delimiters).
Definition: string.cpp:827
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::GenomeRegion::start
size_t start
Definition: genome_region.hpp:75
genesis::population::is_covered
bool is_covered(GenomeRegion const &region, std::string const &chromosome, size_t position)
Test whether the chromosome/position is within a given genomic region.
Definition: genome_region.cpp:207
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:74
input_source.hpp
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::parse_genome_regions
GenomeRegionList parse_genome_regions(std::string const &regions, bool zero_based, bool end_exclusive)
Parse a set/list of genomic regions.
Definition: genome_region.cpp:190
convert.hpp
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:70
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: genome_region.cpp:72
scanner.hpp
genesis::utils::split_at
std::vector< std::string > split_at(std::string const &str, std::string const &delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter string.
Definition: string.cpp:621
genesis::population::GenomeRegion::specified
bool specified() const
Definition: genome_region.hpp:92
genesis::population::GenomeRegion::end
size_t end
Definition: genome_region.hpp:76
genesis::population::parse_genome_region
GenomeRegion parse_genome_region(std::string const &region, bool zero_based, bool end_exclusive)
Parse a genomic region.
Definition: genome_region.cpp:107