A library for working with phylogenetic and population genetic data.
v0.32.0
genome_region_list.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_GENOME_REGION_LIST_H_
2 #define GENESIS_POPULATION_GENOME_REGION_LIST_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
34 #include <cassert>
35 #include <cstdint>
36 #include <map>
37 #include <stdexcept>
38 #include <string>
39 #include <vector>
40 
44 
45 namespace genesis {
46 namespace population {
47 
48 // =================================================================================================
49 // Genome Data
50 // =================================================================================================
51 
57 {};
58 
59 // =================================================================================================
60 // Genome Region List
61 // =================================================================================================
62 
94 // template<class DataType = EmptyGenomeData>
96 {
97 public:
98 
99  // -------------------------------------------------------------------------
100  // Typedefs and Enums
101  // -------------------------------------------------------------------------
102 
103  // using data_type = DataType;
105  using numerical_type = size_t;
108  >;
110 
111  using iterator = typename tree_type::iterator;
113 
114  // -------------------------------------------------------------------------
115  // Constructors and Rule of Five
116  // -------------------------------------------------------------------------
117 
118  GenomeRegionList() = default;
119  ~GenomeRegionList() = default;
120 
121  GenomeRegionList( GenomeRegionList const& ) = default;
122  GenomeRegionList( GenomeRegionList&& ) = default;
123 
124  GenomeRegionList& operator= ( GenomeRegionList const& ) = default;
126 
127  // -------------------------------------------------------------------------
128  // Modifiers
129  // -------------------------------------------------------------------------
130 
131  // -------------------------------------------
132  // Add Manually
133  // -------------------------------------------
134 
139  void add( std::string const& chromosome )
140  {
141  // We use the special value 0 to denote that we want the whole chromosome.
142  add( chromosome, 0, 0 );
143  }
144 
154  void add(
155  std::string const& chromosome,
156  numerical_type start,
157  numerical_type end,
158  bool overlap = false
159  ) {
160  // Check chromosome.
161  if( chromosome.empty() ) {
162  throw std::invalid_argument(
163  "Cannot add region to GenomeRegionList with empty chromosome name, "
164  "as this denotes an invalid chromosome."
165  );
166  }
167 
168  // Check positions.
169  // The start and end are also checked in the interval tree, but let's do it here
170  // so that the error message is nicer in case they are wrong.
171  if( start > end ) {
172  throw std::invalid_argument(
173  "Cannot add region to GenomeRegionList with start == " +
174  std::to_string( start ) + " > end == " + std::to_string( end )
175  );
176  }
177  if(( start == 0 ) ^ ( end == 0 )) {
178  throw std::invalid_argument(
179  "Cannot add region to GenomeRegionList with either start == 0 or end == 0, "
180  "but not both, as we use 1-base indexing, with both being 0 being interpreted "
181  "as the special case of denoting the whole chromosome. "
182  "Hence either both start and end have to be 0, or neither."
183  );
184  }
185 
186  // Insert, either by merging with an existing, or as a new region.
187  // We just get the chromosome from the map via array access, which will create it
188  // if it is not yet present.
189  if( overlap ) {
190  regions_[ chromosome ].insert_overlap({ start, end });
191  } else {
192  regions_[ chromosome ].insert({ start, end });
193  }
194  }
195 
196  // -------------------------------------------
197  // Add Locus
198  // -------------------------------------------
199 
209  void add( GenomeLocus const& locus, bool overlap = false )
210  {
211  add( locus.chromosome, locus.position, locus.position, overlap );
212  }
213 
219  void add( GenomeLocus const& start, GenomeLocus const& end, bool overlap = false )
220  {
221  if( start.chromosome != end.chromosome ) {
222  throw std::invalid_argument(
223  "Cannot use two GenomeLocus instances with different chromosomes ( start == \"" +
224  start.chromosome + "\", end == \"" + end.chromosome + "\") as an entry in a "
225  "GenomeRegionList."
226  );
227  }
228  add( start.chromosome, start.position, end.position, overlap );
229  }
230 
231  // -------------------------------------------
232  // Add Region
233  // -------------------------------------------
234 
242  void add( GenomeRegion const& region, bool overlap = false )
243  {
244  add( region.chromosome, region.start, region.end, overlap );
245  }
246 
247  // -------------------------------------------
248  // Add Region List
249  // -------------------------------------------
250 
258  void add( GenomeRegionList const& other, bool overlap = false )
259  {
260  for( auto const& chr : other.regions_ ) {
261  for( auto const& interval : chr.second ) {
262  add( chr.first, interval.low(), interval.high(), overlap );
263  }
264  }
265  }
266 
267  // -------------------------------------------
268  // Clear
269  // -------------------------------------------
270 
274  void clear()
275  {
276  regions_.clear();
277  }
278 
282  void clear( std::string const& chromosome )
283  {
284  if( regions_.count( chromosome ) == 0 ) {
285  throw std::invalid_argument(
286  "Chromosome name \"" + chromosome + "\" not found in GenomeRegionList"
287  );
288  }
289  regions_.erase( chromosome );
290  }
291 
292  // -------------------------------------------------------------------------
293  // Locus Queries
294  // -------------------------------------------------------------------------
295 
300  bool is_covered( std::string const& chromosome, numerical_type position ) const
301  {
302  // Using find(), so we only have to search in the map once, for speed.
303  auto const it = regions_.find( chromosome );
304  if( it == regions_.end() ) {
305  return false;
306  }
307  auto const& chrom_tree = it->second;
308 
309  // If the chromosome in our interval tree contains the 0 interval, we consider that
310  // as having the whole chromosome covered.
311  if( chrom_tree.overlap_find( 0 ) != chrom_tree.end() ) {
312  return true;
313  }
314 
315  // If the above is not the case, check the actual position.
316  return chrom_tree.overlap_find( position ) != chrom_tree.end();
317  }
318 
324  bool is_covered( std::string const& chromosome ) const
325  {
326  auto const it = regions_.find( chromosome );
327  if( it == regions_.end() ) {
328  return false;
329  }
330  auto const& chrom_tree = it->second;
331  return chrom_tree.overlap_find( 0 ) != chrom_tree.end();
332  }
333 
347  size_t cover_count(
348  std::string const& chromosome,
349  numerical_type position,
350  bool whole_chromosome = false
351  ) const {
352  // Using find(), so we only have to search in the map once, for speed.
353  auto const it = regions_.find( chromosome );
354  if( it == regions_.end() ) {
355  return 0;
356  }
357  auto const& chrom_tree = it->second;
358 
359  // Count overlaps.
360  size_t result = 0;
361  chrom_tree.overlap_find_all( position, [&]( const_iterator it ){
362  (void) it;
363  assert( it->interval().within( position ));
364  ++result;
365  return true;
366  });
367 
368  // Add whole chromosome if needed.
369  if( whole_chromosome && ( chrom_tree.overlap_find( 0 ) != chrom_tree.end() )) {
370  ++result;
371  }
372 
373  return result;
374  }
375 
376  // Not used at the moment, as we have no access to the end iterator to check for a valid find.
377 
378  // /* *
379  // * @brief Return an iterator to the GenomeRegion that covers the given position on the
380  // * chromosome, or an end iterator if there is no region that covers the position.
381  // *
382  // * Throws an exception if the chromosome is not in the list at all.
383  // */
384  // const_iterator find( std::string const& chromosome, size_t position ) const
385  // {
386  // // Using find(), so we only have to search in the map once, for speed.
387  // auto const it = regions_.find( chromosome );
388  // if( it == regions_.end() ) {
389  // throw std::invalid_argument(
390  // "GenomeRegionList does not contain chromosome \"" + chromosome + "\""
391  // );
392  // }
393  // auto const& chrom_tree = it->second;
394  // return chrom_tree.overlap_find( position );
395  // }
396 
397  // -------------------------------------------------------------------------
398  // Chromosome Accessors
399  // -------------------------------------------------------------------------
400 
404  bool empty() const
405  {
406  return regions_.empty();
407  }
408 
412  size_t chromosome_count() const
413  {
414  return regions_.size();
415  }
416 
420  std::vector<std::string> chromosome_names() const
421  {
422  std::vector<std::string> result;
423  for( auto const& p : regions_ ) {
424  result.push_back( p.first );
425  }
426  return result;
427  }
428 
432  bool has_chromosome( std::string const& chromosome ) const
433  {
434  return regions_.count( chromosome ) > 0;
435  }
436 
441  tree_type const& chromosome_regions( std::string const& chromosome ) const
442  {
443  return regions_.at( chromosome );
444  }
445 
455  tree_type& chromosome_regions( std::string const& chromosome )
456  {
457  return regions_.at( chromosome );
458  }
459 
463  size_t region_count( std::string const& chromosome ) const
464  {
465  if( regions_.count( chromosome ) == 0 ) {
466  throw std::invalid_argument(
467  "Chromosome name \"" + chromosome + "\" not found in GenomeRegionList"
468  );
469  }
470  return regions_.at( chromosome ).size();
471  }
472 
476  size_t total_region_count() const
477  {
478  size_t cnt = 0;
479  for( auto const& reg : regions_ ) {
480  cnt += reg.second.size();
481  }
482  return cnt;
483  }
484 
496  std::map<std::string, tree_type> const& chromosome_map() const
497  {
498  return regions_;
499  }
500 
504  std::map<std::string, tree_type>& chromosome_map()
505  {
506  return regions_;
507  }
508 
509  // -------------------------------------------------------------------------
510  // Iterator
511  // -------------------------------------------------------------------------
512 
513  // iterator begin()
514  // {
515  // return regions_.begin();
516  // }
517 
518  // const_iterator begin() const
519  // {
520  // return regions_.begin();
521  // }
522 
523  // iterator end()
524  // {
525  // return regions_.end();
526  // }
527 
528  // const_iterator end() const
529  // {
530  // return regions_.end();
531  // }
532 
533  // -------------------------------------------------------------------------
534  // Settings
535  // -------------------------------------------------------------------------
536 
537  // bool allow_overlap() const
538  // {
539  // return allow_overlap_;
540  // }
541  //
542  // self_type& allow_overlap( bool value )
543  // {
544  // when set this way, it will only affect newly added regions...
545  // better in constructor and const?
546  // allow_overlap_ = value;
547  // return *this;
548  // }
549 
550  // -------------------------------------------------------------------------
551  // Data Members
552  // -------------------------------------------------------------------------
553 
554 private:
555 
556  std::map<std::string, tree_type> regions_;
557 
558  // bool allow_overlap_ = true;
559 
560 };
561 
562 } // namespace population
563 } // namespace genesis
564 
565 #endif // include guard
genesis::population::GenomeRegionList::const_iterator
typename tree_type::const_iterator const_iterator
Definition: genome_region_list.hpp:112
genesis::population::GenomeRegionList::chromosome_names
std::vector< std::string > chromosome_names() const
Get a list of all stored chromosome names.
Definition: genome_region_list.hpp:420
genesis::population::GenomeRegionList::chromosome_map
std::map< std::string, tree_type > const & chromosome_map() const
Access the underlying container directly.
Definition: genome_region_list.hpp:496
genesis::population::GenomeRegionList::clear
void clear(std::string const &chromosome)
Remove the regions of the specified chromosome.
Definition: genome_region_list.hpp:282
genesis::population::GenomeRegionList::region_count
size_t region_count(std::string const &chromosome) const
Return the number of regions stored for the specified chromosome.
Definition: genome_region_list.hpp:463
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_region_list.hpp:139
genesis::population::GenomeRegionList::empty
bool empty() const
Return whether there are chromosomes with regions stored.
Definition: genome_region_list.hpp:404
genesis::population::GenomeLocus::position
size_t position
Definition: genome_locus.hpp:67
genome_locus.hpp
genesis::population::GenomeRegionList::add
void add(GenomeRegionList const &other, bool overlap=false)
Add a complete GenomeRegionList to this list.
Definition: genome_region_list.hpp:258
genesis::population::GenomeRegionList::chromosome_count
size_t chromosome_count() const
Return the number of chromosomes for which there are regions stored.
Definition: genome_region_list.hpp:412
genesis::population::GenomeLocus
A single locus, that is, a position (or coordinate) on a chromosome.
Definition: genome_locus.hpp:64
genesis::population::GenomeRegionList::has_chromosome
bool has_chromosome(std::string const &chromosome) const
Return whether a chromosome is stored.
Definition: genome_region_list.hpp:432
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::GenomeRegion::start
size_t start
Definition: genome_region.hpp:75
genesis::population::GenomeLocus::chromosome
std::string chromosome
Definition: genome_locus.hpp:66
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:74
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genome_region.hpp
genesis::population::GenomeRegionList::data_type
EmptyGenomeData data_type
Definition: genome_region_list.hpp:104
genesis::population::GenomeRegionList::cover_count
size_t cover_count(std::string const &chromosome, numerical_type position, bool whole_chromosome=false) const
Retun the number of regions (intervals) that overlap with a given position on a chromosome.
Definition: genome_region_list.hpp:347
genesis::population::GenomeRegionList::is_covered
bool is_covered(std::string const &chromosome) const
Return whether a whole chromosome is covered.
Definition: genome_region_list.hpp:324
genesis::population::GenomeRegionList::chromosome_map
std::map< std::string, tree_type > & chromosome_map()
Access the underlying container directly.
Definition: genome_region_list.hpp:504
genesis::utils::IntervalClosed
Helper type to define a closed [] Interval.
Definition: interval.hpp:208
interval_tree.hpp
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome, numerical_type start, numerical_type end, bool overlap=false)
Add a region to the list, given its chromosome, and start and end positions.
Definition: genome_region_list.hpp:154
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GenomeRegionList::is_covered
bool is_covered(std::string const &chromosome, numerical_type position) const
Return whether a given position on a chromosome is part of any of the regions (intervals) that are st...
Definition: genome_region_list.hpp:300
genesis::population::GenomeRegionList::operator=
GenomeRegionList & operator=(GenomeRegionList const &)=default
genesis::population::GenomeRegionList::add
void add(GenomeLocus const &start, GenomeLocus const &end, bool overlap=false)
Add an interval between two GenomeLoci on the same chromosome.
Definition: genome_region_list.hpp:219
genesis::population::GenomeRegionList::chromosome_regions
tree_type const & chromosome_regions(std::string const &chromosome) const
For a given chromosome, return the IntervalTree that stores its regions.
Definition: genome_region_list.hpp:441
genesis::utils::IntervalTree::iterator
IntervalTreeIterator< node_type, false > iterator
Definition: interval_tree.hpp:96
genesis::population::GenomeRegionList::add
void add(GenomeLocus const &locus, bool overlap=false)
Add a single GenomeLocus, that is, an interval covering one position on a chromosome.
Definition: genome_region_list.hpp:209
genesis::population::EmptyGenomeData
Helper struct to define a default empty data for the classes GenomeLocus, GenomeRegion,...
Definition: genome_region_list.hpp:56
genesis::population::GenomeRegionList::GenomeRegionList
GenomeRegionList()=default
genesis::utils::IntervalTree::const_iterator
IntervalTreeIterator< node_type, true > const_iterator
Definition: interval_tree.hpp:97
genesis::population::GenomeRegionList::add
void add(GenomeRegion const &region, bool overlap=false)
Add a GenomeRegion to the list.
Definition: genome_region_list.hpp:242
genesis::population::GenomeRegionList::chromosome_regions
tree_type & chromosome_regions(std::string const &chromosome)
For a given chromosome, return the IntervalTree that stores its regions.
Definition: genome_region_list.hpp:455
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:70
genesis::population::GenomeRegionList::clear
void clear()
Remove all stored regions from all chromosomes.
Definition: genome_region_list.hpp:274
genesis::population::GenomeRegionList::~GenomeRegionList
~GenomeRegionList()=default
genesis::population::GenomeRegionList::numerical_type
size_t numerical_type
Definition: genome_region_list.hpp:105
genesis::utils::IntervalTree
Interval tree that enables storing and querying intervals, each containing some data.
Definition: fwd.hpp:45
genesis::population::GenomeRegion::end
size_t end
Definition: genome_region.hpp:76
genesis::population::GenomeRegionList::total_region_count
size_t total_region_count() const
Return the number of regions stored in total, across all chromosomes.
Definition: genome_region_list.hpp:476
genesis::population::GenomeRegionList::iterator
typename tree_type::iterator iterator
Definition: genome_region_list.hpp:111