A library for working with phylogenetic data.
v0.25.0
genome_region.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
31 #include <algorithm>
32 #include <cassert>
33 #include <iostream>
34 #include <stdexcept>
35 
37 
38 namespace genesis {
39 namespace population {
40 
41 // =================================================================================================
42 // Genome Region List
43 // =================================================================================================
44 
45 void GenomeRegionList::add( GenomeRegion const& region )
46 {
47  // Sanity check
48  if( region.start >= region.end ) {
49  throw std::runtime_error( "Invalid GenomeRegion with start >= end" );
50  }
51 
52  // Find the position in the already given regions at which we need to insert the new one.
53  auto const pos = std::lower_bound( regions_.begin(), regions_.end(), region, [](
54  GenomeRegion const& lhs, GenomeRegion const& rhs
55  ){
56  // The comp operator is called with `lhs` being the iterated elements of `regions_`,
57  // and rhs being the new `region` to be inserted. The operator is 'less than', which we here
58  // want to mean: if the chromosomes differ, we sort by chromosome. Next, on the same
59  // chromosome, `lhs` is less than `rhs` iff it comes completely before `rhs`.
60  // Any overlap between the two is forbidden, but we still stop in that case and then later
61  // check for that. So, in all cases where the end of `lhs` is after the start of `rhs`,
62  // we want to stop.
63  // Alternative way of thinking about this: In the search, move to the next region in the list
64  // as long as the condition is true, that is, as long as (on the same chromosome) the
65  // region in the list is completely (start and end) before the given new region.
66  return
67  lhs.chromosome < rhs.chromosome ||
68  ( lhs.chromosome == rhs.chromosome && lhs.end <= rhs.start )
69  ;
70  });
71 
72  // Check for overlap. We only need to check one condition here, as we already implicitly
73  // checked the other above when finding the insertion position. But let's assert this.
74  if( pos != regions_.end() && pos->start < region.end ) {
75  assert( region.chromosome == pos->chromosome );
76  assert( pos->end > region.start );
77  throw std::runtime_error(
78  "Cannot add GenomeRegion to GenomeRegionList: Invalid GenomeRegion [ " +
79  std::to_string( region.start ) + ", " + std::to_string( region.end ) +
80  " ) that overlaps with existing GenomeRegion [ " + std::to_string( pos->start ) +
81  ", " + std::to_string( pos->end ) + " ) on chromosome " + region.chromosome
82  );
83  }
84 
85  // If the above check passed, let's assert that the region is in front of pos,
86  // at least as long as we found a position and that position is on the same chromosome.
87  assert( pos == regions_.end() || region.chromosome != pos->chromosome || region.end <= pos->start );
88 
89  // Finally, insert the new element.
90  regions_.insert( pos, region );
91 
92  // Set the cache to a valid iterator.
93  find_cache_ = regions_.end();
94 
95  // Assert that all regions are in order.
96  assert(
97  [&](){
98  std::string chrom;
99  size_t pos = 0;
100  for( auto const& r : regions_ ) {
101  if( r.chromosome != chrom ) {
102  chrom = r.chromosome;
103  pos = 0;
104  }
105  if( r.start < pos || r.end <= pos ) {
106  return false;
107  }
108  pos = r.end;
109  }
110  return true;
111  }()
112  );
113 }
114 
115 void GenomeRegionList::add( std::string const& chromosome, size_t start, size_t end )
116 {
117  add( GenomeRegion( chromosome, start, end ));
118 }
119 
120 bool GenomeRegionList::is_covered( std::string const& chromosome, size_t position ) const
121 {
122  return find( chromosome, position ) != regions_.end();
123 }
124 
126  std::string const& chromosome, size_t position
127 ) const {
128 
129  // We use a cache for the region that was used last time.
130  // In many cases, we query adjacent positions, and hence this should yield quite speedup.
131  if(
132  find_cache_ != regions_.end() && find_cache_->chromosome == chromosome &&
133  find_cache_->start <= position && position < find_cache_->end
134  ) {
135  return find_cache_;
136  }
137 
138  // Find the chromosome and position using binary search.
139  // We are looking for the first interval that is not completely smaller (start and end)
140  // than the given position. Think of the following lambda as stating "return true as long
141  // as we want to move on to the next region in the list".
142  auto const pos = std::lower_bound( regions_.begin(), regions_.end(), position, [&](
143  GenomeRegion const& lhs, size_t position
144  ){
145  assert( lhs.start < lhs.end );
146  return
147  lhs.chromosome < chromosome ||
148  ( lhs.chromosome == chromosome && lhs.end <= position )
149  ;
150  });
151 
152  // Nothing found that fits
153  if( pos == regions_.end() ) {
154  return regions_.end();
155  }
156 
157  // Found the first interval not smaller than the given position.
158  // Let's test whether position is within this interval.
159  assert( pos->chromosome == chromosome );
160  assert( position < pos->end );
161  if( position >= pos->start ) {
162  find_cache_ = pos;
163  return pos;
164  }
165  return regions_.end();
166 
167  // Simple version without binary search:
168  // for( auto it = regions_.cbegin(); it != regions_.end(); ++it ) {
169  // if( it->chromosome != chromosome ) {
170  // continue;
171  // }
172  // if( position >= it->start && position < it->end ) {
173  // return it;
174  // }
175  // }
176  // return regions_.end();
177 }
178 
179 } // namespace population
180 } // namespace genesis
genesis::population::GenomeRegionList::end
const_iterator end() const
Definition: genome_region.hpp:192
genesis::population::GenomeRegion::start
size_t start
Definition: genome_region.hpp:57
genesis::population::GenomeRegionList::is_covered
bool is_covered(std::string const &chromosome, size_t position) const
Return whether a given position on a chromosome is part of any of the regions stored.
Definition: genome_region.cpp:120
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:56
genome_region.hpp
genesis::population::GenomeRegionList::const_iterator
typename container::const_iterator const_iterator
Definition: genome_region.hpp:95
genesis::population::GenomeRegionList::find
const_iterator find(std::string const &chromosome, size_t position) const
Return an iterator to the GenomeRegion that covers the given position on the chromosome,...
Definition: genome_region.cpp:125
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:51
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::population::GenomeRegionList::add
void add(GenomeRegion const &region)
Add a GenomeRegion to the list.
Definition: genome_region.cpp:45
genesis::population::GenomeRegion::end
size_t end
Definition: genome_region.hpp:58