A library for working with phylogenetic and population genetic data.
v0.32.0
genome_locus_set.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_GENOME_LOCUS_SET_H_
2 #define GENESIS_POPULATION_GENOME_LOCUS_SET_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
34 #include <cassert>
35 #include <cstdint>
36 #include <limits>
37 #include <stdexcept>
38 #include <string>
39 #include <unordered_map>
40 #include <vector>
41 
47 
48 namespace genesis {
49 namespace population {
50 
51 // =================================================================================================
52 // Genome Locus Set
53 // =================================================================================================
54 
76 {
77 public:
78 
79  // -------------------------------------------------------------------------
80  // Typedefs and Enums
81  // -------------------------------------------------------------------------
82 
83  using key_type = std::string;
85  using value_type = std::pair<std::string const, utils::Bitvector>;
86 
88  using const_reference = value_type const&;
89 
90  using iterator = typename std::unordered_map<std::string, utils::Bitvector>::iterator;
91  using const_iterator = typename std::unordered_map<std::string, utils::Bitvector>::const_iterator;
92 
96  static const size_t npos = std::numeric_limits<size_t>::max();
97  static_assert(
99  "Differing definitions of GenomeLocusSet::npos and Bitvector::npos"
100  );
101 
102  // -------------------------------------------------------------------------
103  // Constructors and Rule of Five
104  // -------------------------------------------------------------------------
105 
106  GenomeLocusSet() = default;
107  ~GenomeLocusSet() = default;
108 
109  GenomeLocusSet( GenomeLocusSet const& ) = default;
110  GenomeLocusSet( GenomeLocusSet&& ) = default;
111 
112  GenomeLocusSet& operator= ( GenomeLocusSet const& ) = default;
113  GenomeLocusSet& operator= ( GenomeLocusSet&& ) = default;
114 
115  // -------------------------------------------------------------------------
116  // Modifiers
117  // -------------------------------------------------------------------------
118 
119  // -------------------------------------------
120  // Add Manually
121  // -------------------------------------------
122 
127  void add( std::string const& chromosome )
128  {
129  // We use the special value 0 to denote that we want the whole chromosome.
130  add( chromosome, 0, 0 );
131  }
132 
136  void add( std::string const& chromosome, size_t position )
137  {
138  add( chromosome, position, position );
139  }
140 
150  void add(
151  std::string const& chromosome,
152  size_t start,
153  size_t end
154  );
155 
156  // -------------------------------------------
157  // Add Locus
158  // -------------------------------------------
159 
163  void add( GenomeLocus const& locus )
164  {
165  add( locus.chromosome, locus.position, locus.position );
166  }
167 
171  void add( GenomeLocus const& start, GenomeLocus const& end )
172  {
173  if( start.chromosome != end.chromosome ) {
174  throw std::invalid_argument(
175  "Cannot use two GenomeLocus instances with different chromosomes ( start == \"" +
176  start.chromosome + "\", end == \"" + end.chromosome + "\") as an entry in a "
177  "GenomeLocusSet."
178  );
179  }
180  add( start.chromosome, start.position, end.position );
181  }
182 
183  // -------------------------------------------
184  // Add Region
185  // -------------------------------------------
186 
192  void add( GenomeRegion const& region )
193  {
194  add( region.chromosome, region.start, region.end );
195  }
196 
197  // -------------------------------------------
198  // Add Region List
199  // -------------------------------------------
200 
204  void add( GenomeRegionList const& list )
205  {
206  for( auto const& chr : list.chromosome_map() ) {
207  for( auto const& interval : chr.second ) {
208  add( chr.first, interval.low(), interval.high() );
209  }
210  }
211  }
212 
213  // -------------------------------------------
214  // Add Bitvector
215  // -------------------------------------------
216 
223  void add( std::string const& chromosome, utils::Bitvector const& values );
224 
228  void add( std::string const& chromosome, utils::Bitvector&& values );
229 
230  // -------------------------------------------
231  // Clear
232  // -------------------------------------------
233 
237  void clear()
238  {
239  locus_map_.clear();
240  }
241 
245  void clear( std::string const& chromosome )
246  {
247  if( locus_map_.count( chromosome ) == 0 ) {
248  throw std::invalid_argument(
249  "Chromosome name \"" + chromosome + "\" not found in GenomeLocusSet"
250  );
251  }
252  locus_map_.erase( chromosome );
253  }
254 
255  // -------------------------------------------------------------------------
256  // Operations
257  // -------------------------------------------------------------------------
258 
264  void set_intersect( GenomeLocusSet const& rhs );
265 
269  void set_union( GenomeLocusSet const& rhs );
270 
279  void invert( sequence::SequenceDict const& sequence_dict );
280 
281  // -------------------------------------------------------------------------
282  // Locus Covered
283  // -------------------------------------------------------------------------
284 
296  static bool is_covered( utils::Bitvector const& bitvector, size_t position )
297  {
298  // Boundary check.
299  if( bitvector.empty() ) {
300  throw std::invalid_argument(
301  "GenomeLocusSet::is_covered( Bitvector const&, size_t ) called with empty Bitvector"
302  );
303  }
304  // if( position == 0 ) {
305  // throw std::invalid_argument(
306  // "GenomeLocusSet::is_covered( Bitvector const&, size_t ) called with position==0"
307  // );
308  // }
309 
310  // If the chromosome has the 0th bit set, the whole chromosome is covered.
311  if( bitvector.get( 0 ) ) {
312  return true;
313  }
314 
315  // If the above is not the case, check the actual position.
316  // If the position is outside of the bitvector, it is not covered, obviously.
317  if( position >= bitvector.size() ) {
318  return false;
319  }
320  return bitvector.get( position );
321  }
322 
332  static bool is_covered( const_iterator const& it, size_t position )
333  {
334  return is_covered( it->second, position );
335  }
336 
342  bool is_covered( std::string const& chromosome, size_t position ) const
343  {
344  // Using find(), so we only have to search in the map once, for speed.
345  auto const it = locus_map_.find( chromosome );
346  if( it == locus_map_.end() ) {
347  return false;
348  }
349  auto const& bv = it->second;
350  assert( ! bv.empty() );
351  return is_covered( bv, position );
352  }
353 
360  bool is_covered( std::string const& chromosome ) const
361  {
362  auto const it = locus_map_.find( chromosome );
363  if( it == locus_map_.end() ) {
364  return false;
365  }
366  auto const& bv = it->second;
367  assert( ! bv.empty() );
368  return bv.get( 0 );
369  }
370 
371  // -------------------------------------------------------------------------
372  // Any Covered Locus
373  // -------------------------------------------------------------------------
374 
378  bool any_covered( std::string const& chromosome ) const
379  {
380  auto const it = locus_map_.find( chromosome );
381  if( it == locus_map_.end() ) {
382  return false;
383  }
384  auto const& bv = it->second;
385  assert( ! bv.empty() );
386 
387  // We do not need to to an extra check for position 0 here.
388  // If it is true, then so will be the result.
389  return bv.any_set();
390  }
391 
392  // -------------------------------------------------------------------------
393  // Next Covered Locus
394  // -------------------------------------------------------------------------
395 
403  static size_t next_covered( utils::Bitvector const& bitvector, size_t start_position )
404  {
405  // Boundary check.
406  if( bitvector.empty() ) {
407  throw std::invalid_argument(
408  "GenomeLocusSet::next_covered() called with empty Bitvector"
409  );
410  }
411  if( start_position == 0 ) {
412  throw std::invalid_argument(
413  "GenomeLocusSet::next_covered() called with start_position==0"
414  );
415  }
416 
417  // If the chromosome has the 0th bit set, the whole chromosome is covered,
418  // so that the start_position we are at is also covered.
419  if( bitvector.get( 0 ) ) {
420  return start_position;
421  }
422 
423  // If the above is not the case, check the actual start_position.
424  // If the start_position is outside of the bitvector, it is not covered, obviously.
425  return bitvector.find_next_set( start_position );
426  }
427 
435  static size_t next_covered( const_iterator const& it, size_t start_position )
436  {
437  return next_covered( it->second, start_position );
438  }
439 
449  size_t next_covered( std::string const& chromosome, size_t start_position ) const
450  {
451  auto const it = locus_map_.find( chromosome );
452  if( it == locus_map_.end() ) {
453  return GenomeLocusSet::npos;
454  }
455  auto const& bv = it->second;
456  assert( ! bv.empty() );
457  return next_covered( bv, start_position );
458  }
459 
460  // -------------------------------------------------------------------------
461  // Chromosome Iterators
462  // -------------------------------------------------------------------------
463 
468  {
469  return locus_map_.begin();
470  }
471 
476  {
477  return locus_map_.end();
478  }
479 
480  // -------------------------------------------------------------------------
481  // Chromosome Accessors
482  // -------------------------------------------------------------------------
483 
487  bool empty() const
488  {
489  return locus_map_.empty();
490  }
491 
495  size_t chromosome_count() const
496  {
497  return locus_map_.size();
498  }
499 
503  std::vector<std::string> chromosome_names() const
504  {
505  std::vector<std::string> result;
506  for( auto const& p : locus_map_ ) {
507  result.push_back( p.first );
508  }
509  return result;
510  }
511 
515  bool has_chromosome( std::string const& chromosome ) const
516  {
517  return locus_map_.count( chromosome ) > 0;
518  }
519 
525  const_iterator find( std::string const& chromosome )
526  {
527  return locus_map_.find( chromosome );
528  }
529 
534  utils::Bitvector const& chromosome_positions( std::string const& chromosome ) const
535  {
536  return locus_map_.at( chromosome );
537  }
538 
547  utils::Bitvector& chromosome_positions( std::string const& chromosome )
548  {
549  return locus_map_.at( chromosome );
550  }
551 
552  // -------------------------------------------------------------------------
553  // Data Members
554  // -------------------------------------------------------------------------
555 
556 private:
557 
558  // Map from chromosome names to bitvectors representing which positions are in (true)
559  // and out (false). Note that position 0 is special; if set, it means that we consider
560  // the whole chromosome as covered.
561  std::unordered_map<std::string, utils::Bitvector> locus_map_;
562 
563 };
564 
565 } // namespace population
566 } // namespace genesis
567 
568 #endif // include guard
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome, size_t position)
Add a single locus (position, coordinate) to the list.
Definition: genome_locus_set.hpp:136
genesis::population::GenomeLocusSet::add
void add(GenomeRegion const &region)
Add a GenomeRegion to the list.
Definition: genome_locus_set.hpp:192
genesis::population::GenomeRegionList::chromosome_map
std::map< std::string, tree_type > const & chromosome_map() const
Access the underlying container directly.
Definition: genome_region_list.hpp:496
genesis::population::GenomeLocusSet::chromosome_positions
utils::Bitvector const & chromosome_positions(std::string const &chromosome) const
For a given chromosome, return the Bitvector that stores its positions.
Definition: genome_locus_set.hpp:534
genesis::population::GenomeLocusSet::is_covered
static bool is_covered(utils::Bitvector const &bitvector, size_t position)
Return whether a given position on the provided bitvector is covered.
Definition: genome_locus_set.hpp:296
genesis::population::GenomeLocusSet::set_union
void set_union(GenomeLocusSet const &rhs)
Compute the union with another GenomeLocusSet rhs.
Definition: genome_locus_set.cpp:219
genesis::population::GenomeLocusSet::clear
void clear()
Remove all stored regions from all chromosomes.
Definition: genome_locus_set.hpp:237
genesis::population::GenomeLocusSet::chromosome_names
std::vector< std::string > chromosome_names() const
Get a list of all stored chromosome names.
Definition: genome_locus_set.hpp:503
genesis::utils::Bitvector::get
bool get(size_t index) const
Return the value of a single bit, with boundary check.
Definition: bitvector.hpp:167
genesis::population::GenomeLocusSet::is_covered
bool is_covered(std::string const &chromosome) const
Return whether a whole chromosome is covered.
Definition: genome_locus_set.hpp:360
genesis::population::GenomeLocusSet::end
const_iterator end() const
Return an iterator to the end of the map of chromosome names to Bitvectors.
Definition: genome_locus_set.hpp:475
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::population::GenomeLocusSet::value_type
std::pair< std::string const, utils::Bitvector > value_type
Definition: genome_locus_set.hpp:85
genesis::population::GenomeLocus::position
size_t position
Definition: genome_locus.hpp:67
genesis::population::GenomeLocusSet::empty
bool empty() const
Return whether there are chromosomes with positions stored.
Definition: genome_locus_set.hpp:487
genesis::utils::Bitvector::find_next_set
size_t find_next_set(size_t start) const
Return the index of the next position in the Bitvector that is set.
Definition: bitvector.cpp:410
genome_locus.hpp
genesis::population::GenomeLocusSet::has_chromosome
bool has_chromosome(std::string const &chromosome) const
Return whether a chromosome is stored.
Definition: genome_locus_set.hpp:515
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::population::GenomeLocusSet::chromosome_count
size_t chromosome_count() const
Return the number of chromosomes for which there are positions stored.
Definition: genome_locus_set.hpp:495
genesis::population::GenomeLocus
A single locus, that is, a position (or coordinate) on a chromosome.
Definition: genome_locus.hpp:64
genesis::population::GenomeLocusSet::iterator
typename std::unordered_map< std::string, utils::Bitvector >::iterator iterator
Definition: genome_locus_set.hpp:90
genesis::population::GenomeLocusSet::next_covered
static size_t next_covered(const_iterator const &it, size_t start_position)
Return the next position (including the start_position) that is covered.
Definition: genome_locus_set.hpp:435
genesis::population::GenomeLocusSet::add
void add(GenomeRegionList const &list)
Add a complete GenomeRegionList to this list.
Definition: genome_locus_set.hpp:204
genesis::population::GenomeLocusSet::begin
const_iterator begin() const
Return an iterator to the beginning of the map of chromosome names to Bitvectors.
Definition: genome_locus_set.hpp:467
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::GenomeLocusSet::GenomeLocusSet
GenomeLocusSet()=default
genesis::population::GenomeRegion::start
size_t start
Definition: genome_region.hpp:75
genesis::population::GenomeLocusSet::const_reference
value_type const & const_reference
Definition: genome_locus_set.hpp:88
genesis::population::GenomeLocus::chromosome
std::string chromosome
Definition: genome_locus.hpp:66
genesis::population::GenomeRegion::chromosome
std::string chromosome
Definition: genome_region.hpp:74
genesis::population::GenomeLocusSet::clear
void clear(std::string const &chromosome)
Remove the regions of the specified chromosome.
Definition: genome_locus_set.hpp:245
genesis::population::GenomeLocusSet::npos
static const size_t npos
Position value to indicate that next_covered did not find any covered position.
Definition: genome_locus_set.hpp:96
genome_region.hpp
genesis::population::GenomeLocusSet::add
void add(GenomeLocus const &start, GenomeLocus const &end)
Add an interval between two GenomeLoci on the same chromosome.
Definition: genome_locus_set.hpp:171
genesis::population::GenomeLocusSet::add
void add(GenomeLocus const &locus)
Add a single GenomeLocus, that is, an interval covering one position on a chromosome.
Definition: genome_locus_set.hpp:163
genesis::population::GenomeLocusSet::invert
void invert(sequence::SequenceDict const &sequence_dict)
Invert all chromosome regions.
Definition: genome_locus_set.cpp:262
genome_region_list.hpp
sequence_dict.hpp
genesis::population::GenomeLocusSet::reference
value_type & reference
Definition: genome_locus_set.hpp:87
genesis::population::GenomeLocusSet::is_covered
bool is_covered(std::string const &chromosome, size_t position) const
Return whether a given position on a chromosome is part of any of the regions stored.
Definition: genome_locus_set.hpp:342
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GenomeLocusSet::next_covered
static size_t next_covered(utils::Bitvector const &bitvector, size_t start_position)
Return the next position (including the start_position) that is covered.
Definition: genome_locus_set.hpp:403
genesis::utils::Bitvector::empty
bool empty() const
Return whether the Bitvector is empty, that is, has size() == 0.
Definition: bitvector.hpp:296
genesis::population::GenomeLocusSet::next_covered
size_t next_covered(std::string const &chromosome, size_t start_position) const
Return the next position (including the start_position) that is covered.
Definition: genome_locus_set.hpp:449
genesis::population::GenomeLocusSet::set_intersect
void set_intersect(GenomeLocusSet const &rhs)
Compute the intersection with another GenomeLocusSet rhs.
Definition: genome_locus_set.cpp:139
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:70
genesis::population::GenomeLocusSet::find
const_iterator find(std::string const &chromosome)
Find a chromosome in the map.
Definition: genome_locus_set.hpp:525
genesis::population::GenomeLocusSet::key_type
std::string key_type
Definition: genome_locus_set.hpp:83
genesis::population::GenomeLocusSet::chromosome_positions
utils::Bitvector & chromosome_positions(std::string const &chromosome)
For a given chromosome, return the Bitvector that stores its positions.
Definition: genome_locus_set.hpp:547
genesis::population::GenomeLocusSet::operator=
GenomeLocusSet & operator=(GenomeLocusSet const &)=default
bitvector.hpp
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127
genesis::utils::Bitvector::size
size_t size() const
Return the size (number of bits) of this Bitvector.
Definition: bitvector.hpp:304
genesis::utils::Bitvector::npos
static const size_t npos
Value to indicate that find_next_set() did not find any set bits.
Definition: bitvector.hpp:64
genesis::population::GenomeRegion::end
size_t end
Definition: genome_region.hpp:76
genesis::population::GenomeLocusSet::~GenomeLocusSet
~GenomeLocusSet()=default
genesis::population::GenomeLocusSet::any_covered
bool any_covered(std::string const &chromosome) const
Return if the given chromosome has any loci covered.
Definition: genome_locus_set.hpp:378
genesis::population::GenomeLocusSet::is_covered
static bool is_covered(const_iterator const &it, size_t position)
Return whether a given position on the provided iterator is covered.
Definition: genome_locus_set.hpp:332
genesis::population::GenomeLocusSet::const_iterator
typename std::unordered_map< std::string, utils::Bitvector >::const_iterator const_iterator
Definition: genome_locus_set.hpp:91
genesis::utils::Bitvector
Definition: bitvector.hpp:50