A library for working with phylogenetic and population genetic data.
v0.27.0
genome_region_list.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_GENOME_REGION_LIST_H_
2 #define GENESIS_POPULATION_GENOME_REGION_LIST_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #include <cstdint>
35 #include <map>
36 #include <stdexcept>
37 #include <string>
38 #include <vector>
39 
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Genome Data
49 // =================================================================================================
50 
56 {};
57 
58 // =================================================================================================
59 // Genome Region List
60 // =================================================================================================
61 
81 // template<class DataType = EmptyGenomeData>
83 {
84 public:
85 
86  // -------------------------------------------------------------------------
87  // Typedefs and Enums
88  // -------------------------------------------------------------------------
89 
90  // using data_type = DataType;
92  using numerical_type = size_t;
95  >;
97 
98  using iterator = typename tree_type::iterator;
100 
101  // -------------------------------------------------------------------------
102  // Constructors and Rule of Five
103  // -------------------------------------------------------------------------
104 
105  GenomeRegionList() = default;
106  ~GenomeRegionList() = default;
107 
108  GenomeRegionList( GenomeRegionList const& ) = default;
109  GenomeRegionList( GenomeRegionList&& ) = default;
110 
111  GenomeRegionList& operator= ( GenomeRegionList const& ) = default;
113 
114  // -------------------------------------------------------------------------
115  // Modifiers
116  // -------------------------------------------------------------------------
117 
118  // -------------------------------------------
119  // Add Manually
120  // -------------------------------------------
121 
130  void add(
131  std::string const& chromosome,
132  numerical_type start,
133  numerical_type end,
134  bool overlap = false
135  ) {
136  // Check chromosome.
137  if( chromosome.empty() ) {
138  throw std::invalid_argument(
139  "Cannot add region to GenomeRegionList with empty chromosome name, "
140  "as this denotes an invalid chromosome."
141  );
142  }
143 
144  // Check positions.
145  // The start and end are also checked in the interval tree, but let's do it here
146  // so that the error message is nicer in case they are wrong.
147  if( start > end ) {
148  throw std::invalid_argument(
149  "Cannot add region to GenomeRegionList with start == " +
150  std::to_string( start ) + " > end == " + std::to_string( end )
151  );
152  }
153  if( start == 0 || end == 0 ) {
154  throw std::invalid_argument(
155  "Cannot add region to GenomeRegionList with start == 0 or end == 0, "
156  "as these denote invalid positions."
157  );
158  }
159 
160  // Insert, either by merging with an existing, or as a new region.
161  // We just get the chromosome from the map via array access, which will create it
162  // if it is not yet present.
163  if( overlap ) {
164  regions_[ chromosome ].insert_overlap({ start, end });
165  } else {
166  regions_[ chromosome ].insert({ start, end });
167  }
168  }
169 
170  // -------------------------------------------
171  // Add Locus
172  // -------------------------------------------
173 
183  void add( GenomeLocus const& locus, bool overlap = false )
184  {
185  add( locus.chromosome, locus.position, locus.position, overlap );
186  }
187 
193  void add( GenomeLocus const& start, GenomeLocus const& end, bool overlap = false )
194  {
195  if( start.chromosome != end.chromosome ) {
196  throw std::invalid_argument(
197  "Cannot use two GenomeLocus instances with different chromosomes ( start == \"" +
198  start.chromosome + "\", end == \"" + end.chromosome + "\") as an entry in a "
199  "GenomeRegionList."
200  );
201  }
202  add( start.chromosome, start.position, end.position, overlap );
203  }
204 
205  // -------------------------------------------
206  // Add Region
207  // -------------------------------------------
208 
216  void add( GenomeRegion const& region, bool overlap = false )
217  {
218  add( region.chromosome, region.start, region.end, overlap );
219  }
220 
221  // -------------------------------------------
222  // Add Region List
223  // -------------------------------------------
224 
232  void add( GenomeRegionList const& other, bool overlap = false )
233  {
234  for( auto const& chr : other.regions_ ) {
235  for( auto const& interval : chr.second ) {
236  add( chr.first, interval.low(), interval.high(), overlap );
237  }
238  }
239  }
240 
241  // -------------------------------------------
242  // Clear
243  // -------------------------------------------
244 
248  void clear()
249  {
250  regions_.clear();
251  }
252 
256  void clear( std::string const& chromosome )
257  {
258  if( regions_.count( chromosome ) == 0 ) {
259  throw std::invalid_argument(
260  "Chromosome name \"" + chromosome + "\" not found in GenomeRegionList"
261  );
262  }
263  regions_.erase( chromosome );
264  }
265 
266  // -------------------------------------------------------------------------
267  // Locus Queries
268  // -------------------------------------------------------------------------
269 
273  bool is_covered( std::string const& chromosome, numerical_type position ) const
274  {
275  // Using find(), so we only have to search in the map once, for speed.
276  auto const it = regions_.find( chromosome );
277  if( it == regions_.end() ) {
278  return false;
279  }
280  auto const& chrom_tree = it->second;
281  return chrom_tree.overlap_find( position ) != chrom_tree.end();
282  }
283 
284  // Not used at the moment, as we have no access to the end iterator to check for a valid find.
285 
286  // /* *
287  // * @brief Return an iterator to the GenomeRegion that covers the given position on the
288  // * chromosome, or an end iterator if there is no region that covers the position.
289  // *
290  // * Throws an exception if the chromosome is not in the list at all.
291  // */
292  // const_iterator find( std::string const& chromosome, size_t position ) const
293  // {
294  // // Using find(), so we only have to search in the map once, for speed.
295  // auto const it = regions_.find( chromosome );
296  // if( it == regions_.end() ) {
297  // throw std::invalid_argument(
298  // "GenomeRegionList does not contain chromosome \"" + chromosome + "\""
299  // );
300  // }
301  // auto const& chrom_tree = it->second;
302  // return chrom_tree.overlap_find( position );
303  // }
304 
305  // -------------------------------------------------------------------------
306  // Chromosome Accessors
307  // -------------------------------------------------------------------------
308 
312  bool empty() const
313  {
314  return regions_.empty();
315  }
316 
320  size_t chromosome_count() const
321  {
322  return regions_.size();
323  }
324 
328  std::vector<std::string> chromosome_names() const
329  {
330  std::vector<std::string> result;
331  for( auto const& p : regions_ ) {
332  result.push_back( p.first );
333  }
334  return result;
335  }
336 
340  bool has_chromosome( std::string const& chromosome ) const
341  {
342  return regions_.count( chromosome ) > 0;
343  }
344 
349  tree_type const& chromosome_regions( std::string const& chromosome ) const
350  {
351  return regions_.at( chromosome );
352  }
353 
358  tree_type& chromosome_regions( std::string const& chromosome )
359  {
360  return regions_.at( chromosome );
361  }
362 
366  size_t region_count( std::string const& chromosome ) const
367  {
368  if( regions_.count( chromosome ) == 0 ) {
369  throw std::invalid_argument(
370  "Chromosome name \"" + chromosome + "\" not found in GenomeRegionList"
371  );
372  }
373  return regions_.at( chromosome ).size();
374  }
375 
379  size_t total_region_count() const
380  {
381  size_t cnt = 0;
382  for( auto const& reg : regions_ ) {
383  cnt += reg.second.size();
384  }
385  return cnt;
386  }
387 
397  std::map<std::string, tree_type> const& chromosome_map() const
398  {
399  return regions_;
400  }
401 
405  std::map<std::string, tree_type>& chromosome_map()
406  {
407  return regions_;
408  }
409 
410  // -------------------------------------------------------------------------
411  // Iterator
412  // -------------------------------------------------------------------------
413 
414  // iterator begin()
415  // {
416  // return regions_.begin();
417  // }
418 
419  // const_iterator begin() const
420  // {
421  // return regions_.begin();
422  // }
423 
424  // iterator end()
425  // {
426  // return regions_.end();
427  // }
428 
429  // const_iterator end() const
430  // {
431  // return regions_.end();
432  // }
433 
434  // -------------------------------------------------------------------------
435  // Settings
436  // -------------------------------------------------------------------------
437 
438  // bool allow_overlap() const
439  // {
440  // return allow_overlap_;
441  // }
442  //
443  // self_type& allow_overlap( bool value )
444  // {
445  // when set this way, it will only affect newly added regions...
446  // better in constructor and const?
447  // allow_overlap_ = value;
448  // return *this;
449  // }
450 
451  // -------------------------------------------------------------------------
452  // Data Members
453  // -------------------------------------------------------------------------
454 
455 private:
456 
457  std::map<std::string, tree_type> regions_;
458 
459  // bool allow_overlap_ = true;
460 
461 };
462 
463 } // namespace population
464 } // namespace genesis
465 
466 #endif // include guard
genesis::population::GenomeRegionList::const_iterator
typename tree_type::const_iterator const_iterator
Definition: genome_region_list.hpp:99
genesis::population::GenomeRegionList::chromosome_names
std::vector< std::string > chromosome_names() const
Get a list of all stored chromosome names.
Definition: genome_region_list.hpp:328
genesis::population::GenomeRegionList::chromosome_map
std::map< std::string, tree_type > const & chromosome_map() const
Access the underlying container directly.
Definition: genome_region_list.hpp:397
genesis::population::GenomeRegionList::clear
void clear(std::string const &chromosome)
Remove the regions of the specified chromosome.
Definition: genome_region_list.hpp:256
genesis::population::GenomeRegionList::region_count
size_t region_count(std::string const &chromosome) const
Return the number of regions stored for the specified chromosome.
Definition: genome_region_list.hpp:366
genesis::population::GenomeRegionList::empty
bool empty() const
Return whether there are chromosomes with regions stored.
Definition: genome_region_list.hpp:312
genesis::population::GenomeLocus::position
size_t position
Definition: genome_locus.hpp:59
genome_locus.hpp
genesis::population::GenomeRegionList::add
void add(GenomeRegionList const &other, bool overlap=false)
Add a complete GenomeRegionList to this list.
Definition: genome_region_list.hpp:232
genesis::population::GenomeRegionList::chromosome_count
size_t chromosome_count() const
Return the number of chromosomes for which there are regions stored.
Definition: genome_region_list.hpp:320
genesis::population::GenomeLocus
A single locus, that is, a position (or coordinate) on a chromosome.
Definition: genome_locus.hpp:56
genesis::population::GenomeRegionList::has_chromosome
bool has_chromosome(std::string const &chromosome) const
Return whether a chromosome is stored.
Definition: genome_region_list.hpp:340
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:82
genesis::population::GenomeRegion::start
size_t start
Definition: genome_region.hpp:65
genesis::population::GenomeLocus::chromosome
std::string chromosome
Definition: genome_locus.hpp:58
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:64
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genome_region.hpp
genesis::population::GenomeRegionList::data_type
EmptyGenomeData data_type
Definition: genome_region_list.hpp:91
genesis::population::GenomeRegionList::chromosome_map
std::map< std::string, tree_type > & chromosome_map()
Access the underlying container directly.
Definition: genome_region_list.hpp:405
genesis::utils::IntervalClosed
Helper type to define a closed [] Interval.
Definition: interval.hpp:208
interval_tree.hpp
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome, numerical_type start, numerical_type end, bool overlap=false)
Add a GenomeRegion to the list, given its chromosome, and start and end positions.
Definition: genome_region_list.hpp:130
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GenomeRegionList::is_covered
bool is_covered(std::string const &chromosome, numerical_type position) const
Return whether a given position on a chromosome is part of any of the regions stored.
Definition: genome_region_list.hpp:273
genesis::population::GenomeRegionList::operator=
GenomeRegionList & operator=(GenomeRegionList const &)=default
genesis::population::GenomeRegionList::add
void add(GenomeLocus const &start, GenomeLocus const &end, bool overlap=false)
Add an interval between two Loci on the same chromosome.
Definition: genome_region_list.hpp:193
genesis::population::GenomeRegionList::chromosome_regions
tree_type const & chromosome_regions(std::string const &chromosome) const
For a given chromosome, return the IntervalTree that stores its regions.
Definition: genome_region_list.hpp:349
genesis::utils::IntervalTree::iterator
IntervalTreeIterator< node_type, false > iterator
Definition: interval_tree.hpp:96
genesis::population::GenomeRegionList::add
void add(GenomeLocus const &locus, bool overlap=false)
Add a single Locus, that is, an interval covering one position on a chromosome.
Definition: genome_region_list.hpp:183
genesis::population::EmptyGenomeData
Helper struct to define a default empty data for the classes GenomeLocus, GenomeRegion,...
Definition: genome_region_list.hpp:55
genesis::population::GenomeRegionList::GenomeRegionList
GenomeRegionList()=default
genesis::utils::IntervalTree::const_iterator
IntervalTreeIterator< node_type, true > const_iterator
Definition: interval_tree.hpp:97
genesis::population::GenomeRegionList::add
void add(GenomeRegion const &region, bool overlap=false)
Add a GenomeRegion to the list.
Definition: genome_region_list.hpp:216
genesis::population::GenomeRegionList::chromosome_regions
tree_type & chromosome_regions(std::string const &chromosome)
For a given chromosome, return the IntervalTree that stores its regions.
Definition: genome_region_list.hpp:358
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:60
genesis::population::GenomeRegionList::clear
void clear()
Remove all stored regions from all chromosomes.
Definition: genome_region_list.hpp:248
genesis::population::GenomeRegionList::~GenomeRegionList
~GenomeRegionList()=default
genesis::population::GenomeRegionList::numerical_type
size_t numerical_type
Definition: genome_region_list.hpp:92
genesis::utils::IntervalTree
Interval tree that enables storing and querying intervals, each containing some data.
Definition: fwd.hpp:45
genesis::population::GenomeRegion::end
size_t end
Definition: genome_region.hpp:66
genesis::population::GenomeRegionList::total_region_count
size_t total_region_count() const
Return the number of regions stored in total, across all chromosomes.
Definition: genome_region_list.hpp:379
genesis::population::GenomeRegionList::iterator
typename tree_type::iterator iterator
Definition: genome_region_list.hpp:98