A library for working with phylogenetic and population genetic data.
v0.27.0
genome_region.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <iostream>
39 #include <stdexcept>
40 
41 namespace genesis {
42 namespace population {
43 
44 // =================================================================================================
45 // Comparion Operators
46 // =================================================================================================
47 
48 bool operator ==( GenomeRegion const& a, GenomeRegion const& b )
49 {
50  return a.chromosome == b.chromosome && a.start == b.start && a.end == b.end;
51 }
52 
53 bool operator !=( GenomeRegion const& a, GenomeRegion const& b )
54 {
55  return !( a == b );
56 }
57 
58 // =================================================================================================
59 // Parsing & Printing
60 // =================================================================================================
61 
62 std::ostream& operator<<( std::ostream& os, GenomeRegion const& region )
63 {
64  os << to_string( region );
65  // os << region.chromosome << ": [" << region.start << ", " << region.end << ")";
66  return os;
67 }
68 
69 std::string to_string( GenomeRegion const& region )
70 {
71  // Error cases. We need to check separately here, as we want to be able to treat the
72  // start == end == 0 special case extra here, and just print out the chromosome in that case.
73  if( region.chromosome.empty() ) {
74  throw std::invalid_argument(
75  "Invalid GenomeRegion with empty chromosome."
76  );
77  }
78  if(( region.start == 0 ) != ( region.end == 0 )) {
79  throw std::invalid_argument(
80  "Invalid GenomeRegion with one of start and end equal to zero."
81  );
82  }
83  if( region.start > region.end ) {
84  throw std::invalid_argument(
85  "Invalid GenomeRegion with start > end."
86  );
87  }
88 
89  // Special cases.
90  if( region.start == 0 && region.end == 0 ) {
91  return region.chromosome;
92  } else if( region.start == region.end ) {
93  return region.chromosome + ":" + std::to_string( region.start );
94  }
95 
96  // General case.
97  return
98  region.chromosome + ":" +
99  std::to_string( region.start ) + "-" +
100  std::to_string( region.end )
101  ;
102 }
103 
104 GenomeRegion parse_genome_region( std::string const& region, bool zero_based, bool end_exclusive )
105 {
106  GenomeRegion result;
107 
108  // Helper function to throw on error without copies of the same error message each time.
109  auto throw_invalid_arg_ = [&](){
110  throw std::invalid_argument( "Invalid genomic region string \"" + region + "\"" );
111  };
112 
113  // Split by chromosome delimitier and store the chrom. Every string at least yields one split.
114  auto const chr_split = utils::split( region, ":", false );
115  assert( chr_split.size() > 0 );
116  result.chromosome = chr_split[0];
117 
118  // Special cases where either everything is empty, or parts are.
119  if( result.chromosome.empty() || result.chromosome == "-" || result.chromosome == ".." ) {
120  throw_invalid_arg_();
121  }
122 
123  // If there is a part after the `:`, use that for positions.
124  if( chr_split.size() == 2 ) {
125 
126  // Try to split by "-", or if that does not work, try ".." instead.
127  auto pos_split = utils::split( chr_split[1], "-", false );
128  if( pos_split.size() == 1 ) {
129  pos_split = utils::split_at( chr_split[1], "..", false );
130  }
131 
132  // If there is no delimiter, use the number for both start and end.
133  if( pos_split.size() == 1 ) {
134  // Did neither find "-" nor "..". Use position as both start and end.
135  auto const pos = utils::convert_from_string<size_t>( pos_split[0], true );
136  result.start = pos;
137  result.end = pos;
138  } else if( pos_split.size() == 2 ) {
139  // Found a valid split by "-" or "..".
140  // If either part is empty, error. Otherwise, convert.
141  if( pos_split[0].empty() || pos_split[1].empty() ) {
142  throw_invalid_arg_();
143  }
144  result.start = utils::convert_from_string<size_t>( pos_split[0], true );
145  result.end = utils::convert_from_string<size_t>( pos_split[1], true );
146  } else {
147  throw_invalid_arg_();
148  }
149 
150  // Fix coordinates if needed.
151  if( zero_based ) {
152  ++result.start;
153  ++result.end;
154  }
155  if( end_exclusive ) {
156  if( result.end == 0 ) {
157  throw_invalid_arg_();
158  }
159  --result.end;
160  }
161 
162  // Validity check.
163  if( ! result.valid() ) {
164  throw_invalid_arg_();
165  }
166  } else if( chr_split.size() > 2 ) {
167  // Multiple ":" found.
168  throw_invalid_arg_();
169  }
170  return result;
171 }
172 
174  std::string const& regions, bool zero_based, bool end_exclusive
175 ) {
176  GenomeRegionList result;
177 
178  auto const region_list = utils::split( regions, ",", false );
179  for( auto const& region : region_list ) {
180  result.add( parse_genome_region( utils::trim( region ), zero_based, end_exclusive ));
181  }
182 
183  return result;
184 }
185 
186 // =================================================================================================
187 // Region Coverage
188 // =================================================================================================
189 
190 bool is_covered( GenomeRegion const& region, std::string const& chromosome, size_t position )
191 {
192  if( region.start > region.end ) {
193  throw std::runtime_error( "Invalid GenomeRegion with start > end" );
194  }
195 
196  if( region.start > 0 && region.end > 0 ) {
197  // With proper start and/or end, all has to match.
198  auto const chr = chromosome == region.chromosome;
199  auto const beg = position >= region.start;
200  auto const end = position <= region.end;
201  return chr && beg && end;
202  } else if( region.start == 0 && region.end == 0 ) {
203  // If both start and end are zero, we are just matching the chromosome.
204  return chromosome == region.chromosome;
205  }
206 
207  // Edge error case, could throw here as well.
208  assert( region.start == 0 || region.end == 0 );
209  return false;
210 }
211 
212 bool is_covered( GenomeRegionList const& regions, std::string const& chromosome, size_t position )
213 {
214  return regions.is_covered( chromosome, position );
215 }
216 
217 #ifdef GENESIS_HTSLIB
218 
219 bool is_covered( GenomeRegion const& region, VcfRecord const& variant )
220 {
221  return is_covered( region, variant.get_chromosome(), variant.get_position() );
222 }
223 
224 bool is_covered( GenomeRegionList const& regions, VcfRecord const& variant )
225 {
226  return is_covered( regions, variant.get_chromosome(), variant.get_position() );
227 }
228 
229 #endif // htslib guard
230 
231 } // namespace population
232 } // namespace genesis
genesis::population::operator!=
bool operator!=(GenomeLocus const &l, GenomeLocus const &r)
Inequality comparison (!=) for two loci in a genome.
Definition: functions/genome_locus.hpp:261
genesis::population::operator==
bool operator==(GenomeLocus const &l, GenomeLocus const &r)
Equality comparison (==) for two loci in a genome.
Definition: functions/genome_locus.hpp:209
genesis::utils::trim
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:602
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:82
genesis::population::GenomeRegion::start
size_t start
Definition: genome_region.hpp:65
genesis::population::is_covered
bool is_covered(GenomeRegion const &region, std::string const &chromosome, size_t position)
Test whether the chromosome/position is within a given genomic region.
Definition: genome_region.cpp:190
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:64
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, BaseCounts const &bs)
Output stream operator for BaseCounts instances.
Definition: population/functions/functions.cpp:486
genesis::population::GenomeRegion::valid
bool valid() const
Definition: genome_region.hpp:86
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
string.hpp
Provides some commonly used string utility functions.
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome, numerical_type start, numerical_type end, bool overlap=false)
Add a GenomeRegion to the list, given its chromosome, and start and end positions.
Definition: genome_region_list.hpp:130
genesis::population::VcfRecord::get_chromosome
std::string get_chromosome() const
Get the name of a chromosome/contig/sequence (CHROM, first column of the line).
Definition: vcf_record.cpp:170
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GenomeRegionList::is_covered
bool is_covered(std::string const &chromosome, numerical_type position) const
Return whether a given position on a chromosome is part of any of the regions stored.
Definition: genome_region_list.hpp:273
genesis::population::parse_genome_regions
GenomeRegionList parse_genome_regions(std::string const &regions, bool zero_based, bool end_exclusive)
Parse a set/list of genomic regions.
Definition: genome_region.cpp:173
convert.hpp
genesis::population::VcfRecord::get_position
size_t get_position() const
Get the position within the chromosome/contig (POS, second column of the line).
Definition: vcf_record.cpp:181
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:60
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: genome_region.cpp:69
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::utils::split_at
std::vector< std::string > split_at(std::string const &str, std::string const &delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter string.
Definition: string.cpp:424
genome_region.hpp
genesis::population::GenomeRegion::end
size_t end
Definition: genome_region.hpp:66
genesis::utils::split
std::vector< std::string > split(std::string const &str, std::string const &delimiters, const bool trim_empty)
Spilt a string into parts, given a delimiters set of chars.
Definition: string.cpp:386
genesis::population::parse_genome_region
GenomeRegion parse_genome_region(std::string const &region, bool zero_based, bool end_exclusive)
Parse a genomic region.
Definition: genome_region.cpp:104