A library for working with phylogenetic data.
v0.25.0
functions/genome_region.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <iostream>
39 #include <stdexcept>
40 
41 namespace genesis {
42 namespace population {
43 
44 // =================================================================================================
45 // Genome Region
46 // =================================================================================================
47 
48 std::ostream& operator<<( std::ostream& os, GenomeRegion const& region )
49 {
50  os << to_string( region );
51  // os << region.chromosome << ": [" << region.start << ", " << region.end << ")";
52  return os;
53 }
54 
55 std::string to_string( GenomeRegion const& region )
56 {
57  if( region.start == 0 && region.end == 0 ) {
58  return region.chromosome;
59  } else if( region.start == region.end ) {
60  return region.chromosome + ":" + std::to_string( region.start );
61  } else {
62  return
63  region.chromosome + ":" +
64  std::to_string( region.start ) + "-" +
65  std::to_string( region.end )
66  ;
67  }
68 }
69 
70 bool operator ==( GenomeRegion const& a, GenomeRegion const& b )
71 {
72  return a.chromosome == b.chromosome && a.start == b.start && a.end == b.end;
73 }
74 
75 GenomeRegion parse_genome_region( std::string const& region )
76 {
77  GenomeRegion result;
78 
79  // Helper function to throw on error without copies of the same error message each time.
80  auto throw_invalid_arg_ = [&](){
81  throw std::invalid_argument( "Invalid genomic region string '" + region + "'" );
82  };
83 
84  // Split by chromosome delimitier and store the chrom. Every string at least yields one split.
85  auto const chr_split = utils::split( region, ":", false );
86  assert( chr_split.size() > 0 );
87  result.chromosome = chr_split[0];
88 
89  // Special cases where either everything is empty, or parts are.
90  if( result.chromosome.empty() || result.chromosome == "-" || result.chromosome == ".." ) {
91  throw_invalid_arg_();
92  }
93 
94  // If there is a part after the `:`, use that for positions.
95  if( chr_split.size() == 2 ) {
96 
97  // Try to split by "-", or if that does not work, try ".." instead.
98  auto pos_split = utils::split( chr_split[1], "-", false );
99  if( pos_split.size() == 1 ) {
100  pos_split = utils::split_at( chr_split[1], "..", false );
101  }
102 
103  // If there is no delimiter, use the number for both start and end.
104  if( pos_split.size() == 1 ) {
105  // Did neither find "-" nor "..". Use position as both start and end.
106  auto const pos = utils::convert_from_string<size_t>( pos_split[0], true );
107  result.start = pos;
108  result.end = pos;
109  } else if( pos_split.size() == 2 ) {
110  // Found a valid split by "-" or "..".
111  // If either part is empty, error. Otherwise, convert.
112  if( pos_split[0].empty() || pos_split[1].empty() ) {
113  throw_invalid_arg_();
114  }
115  result.start = utils::convert_from_string<size_t>( pos_split[0], true );
116  result.end = utils::convert_from_string<size_t>( pos_split[1], true );
117  } else {
118  throw_invalid_arg_();
119  }
120 
121  // Validity check.
122  if( result.start > result.end ) {
123  throw_invalid_arg_();
124  }
125  } else if( chr_split.size() > 2 ) {
126  // Multiple ":" found.
127  throw_invalid_arg_();
128  }
129  return result;
130 }
131 
132 GenomeRegionList parse_genome_regions( std::string const& regions )
133 {
134  GenomeRegionList result;
135 
136  auto const region_list = utils::split( regions, ",", false );
137  for( auto const& region : region_list ) {
138  result.add( parse_genome_region( utils::trim( region )));
139  }
140 
141  return result;
142 }
143 
144 bool is_covered( GenomeRegion const& region, std::string const& chromosome, size_t position )
145 {
146  if( region.start > region.end ) {
147  throw std::runtime_error( "Invalid GenomeRegion with start > end" );
148  }
149 
150  if( region.start > 0 || region.end > 0 ) {
151  // With proper start and/or end, all has to match.
152  auto const chr = chromosome == region.chromosome;
153  auto const beg = position >= region.start;
154  auto const end = position <= region.end;
155  return chr && beg && end;
156  } else {
157  // If both start and end are zero, we are just matching the chromosome.
158  assert( region.start == 0 && region.end == 0 );
159  return chromosome == region.chromosome;
160  }
161 }
162 
163 #ifdef GENESIS_HTSLIB
164 
165 bool is_covered( GenomeRegion const& region, VcfRecord const& variant )
166 {
167  return is_covered( region, variant.get_chromosome(), variant.get_position() );
168 }
169 
170 #endif // htslib guard
171 
172 } // namespace population
173 } // namespace genesis
genesis::utils::trim
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:602
genesis::population::GenomeRegionList
A sorted list of genome regions.
Definition: genome_region.hpp:79
genesis::population::GenomeRegion::start
size_t start
Definition: genome_region.hpp:57
genesis::population::is_covered
bool is_covered(GenomeRegion const &region, std::string const &chromosome, size_t position)
Test whether the chromosome/position is within a given genomic region.
Definition: functions/genome_region.cpp:144
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:56
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, BaseCounts const &bs)
Output stream operator for BaseCounts instances.
Definition: base_counts.cpp:345
genesis::population::parse_genome_region
GenomeRegion parse_genome_region(std::string const &region)
Parse a genomic region.
Definition: functions/genome_region.cpp:75
string.hpp
Provides some commonly used string utility functions.
genesis::population::parse_genome_regions
GenomeRegionList parse_genome_regions(std::string const &regions)
Parse a set/list of genomic regions.
Definition: functions/genome_region.cpp:132
genesis::population::operator==
bool operator==(GenomeRegion const &a, GenomeRegion const &b)
Definition: functions/genome_region.cpp:70
genesis::population::VcfRecord::get_chromosome
std::string get_chromosome() const
Get the name of a chromosome/contig/sequence (CHROM, first column of the line).
Definition: vcf_record.cpp:170
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
convert.hpp
genesis::population::VcfRecord::get_position
size_t get_position() const
Get the position within the chromosome/contig (POS, second column of the line).
Definition: vcf_record.cpp:175
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:51
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::utils::split_at
std::vector< std::string > split_at(std::string const &str, std::string const &delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter string.
Definition: string.cpp:424
genome_region.hpp
genesis::population::GenomeRegionList::add
void add(GenomeRegion const &region)
Add a GenomeRegion to the list.
Definition: genome_region.cpp:45
genesis::population::GenomeRegion::end
size_t end
Definition: genome_region.hpp:58
genesis::utils::split
std::vector< std::string > split(std::string const &str, std::string const &delimiters, const bool trim_empty)
Spilt a string into parts, given a delimiters set of chars.
Definition: string.cpp:386