A library for working with phylogenetic and population genetic data.
v0.32.0
genome_locus_set.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 #include <unordered_set>
38 
39 namespace genesis {
40 namespace population {
41 
42 // =================================================================================================
43 // Add
44 // =================================================================================================
45 
47  std::string const& chromosome,
48  size_t start,
49  size_t end
50 ) {
51  // Check chromosome.
52  if( chromosome.empty() ) {
53  throw std::invalid_argument(
54  "Cannot add region to GenomeLocusSet with empty chromosome name, "
55  "as this denotes an invalid chromosome."
56  );
57  }
58 
59  // Check positions.
60  // The start and end are also checked in the interval tree, but let's do it here
61  // so that the error message is nicer in case they are wrong.
62  if( start > end ) {
63  throw std::invalid_argument(
64  "Cannot add region to GenomeLocusSet with start == " +
65  std::to_string( start ) + " > end == " + std::to_string( end )
66  );
67  }
68  if(( start == 0 ) ^ ( end == 0 )) {
69  throw std::invalid_argument(
70  "Cannot add region to GenomeLocusSet with either start == 0 or end == 0, "
71  "but not both, as we use 1-base indexing, with both being 0 being interpreted "
72  "as the special case of denoting the whole chromosome. "
73  "Hence either both start and end have to be 0, or neither."
74  );
75  }
76  assert( start <= end );
77 
78  // Create and extend the bitvector as needed. For now, we double its size,
79  // for amortization in the long run. Might find a better strategy later.
80  auto& bv = locus_map_[ chromosome ];
81  if( bv.empty() ) {
82  bv = utils::Bitvector( end + 1 );
83  } else if( bv.size() < end + 1 ) {
84  if( bv.size() * 2 < end + 1 ) {
85  bv = utils::Bitvector( end + 1, bv );
86  } else {
87  bv = utils::Bitvector( bv.size() * 2, bv );
88  }
89  }
90  assert( bv.size() >= end + 1 );
91 
92  // Now set all bits in between the two positions, inclusive.
93  bv.set( start, end + 1 );
94  // for( size_t i = start; i <= end; ++i ) {
95  // bv.set( i );
96  // }
97 }
98 
99 void GenomeLocusSet::add( std::string const& chromosome, utils::Bitvector const& values )
100 {
101  add( chromosome, utils::Bitvector( values ));
102 }
103 
104 void GenomeLocusSet::add( std::string const& chromosome, utils::Bitvector&& values )
105 {
106  // Checks.
107  if( chromosome.empty() ) {
108  throw std::invalid_argument(
109  "Cannot add region to GenomeLocusSet with empty chromosome name, "
110  "as this denotes an invalid chromosome."
111  );
112  }
113  if( locus_map_.count( chromosome ) > 0 ) {
114  throw std::invalid_argument(
115  "Cannot add region via chromosome and Bitvector, as chromosome \"" + chromosome +
116  "\" is already present in the GenomeLocusSet."
117  );
118  }
119  if( values.empty() ) {
120  throw std::invalid_argument(
121  "Cannot add region via chromosome and Bitvector, as given Bitvector is empty."
122  );
123  }
124  assert( values.size() > 0 );
125  if( values.size() == 1 && !values.get(0) ) {
126  throw std::invalid_argument(
127  "Cannot add region via chromosome and Bitvector, as given Bitvector has [0]==false."
128  );
129  }
130 
131  // Set.
132  locus_map_[ chromosome ] = values;
133 }
134 
135 // =================================================================================================
136 // Operations
137 // =================================================================================================
138 
140 {
141  using namespace genesis::utils;
142 
143  // Shorthand for clarity and for ease of refactoring
144  // if we need to make this a free function later.
145  auto& lhs = *this;
146 
147  // Make a list of all chromosomes in the current list.
148  // The ones that are not in rhs or end up empty will be deleted later.
149  std::unordered_set<std::string> chrs_to_delete;
150  for( auto const& chr : lhs.locus_map_ ) {
151  assert( chrs_to_delete.count( chr.first ) == 0 );
152  chrs_to_delete.insert( chr.first );
153  }
154 
155  // Go through all chromosomes of the rhs.
156  for( auto const& chr : rhs.locus_map_ ) {
157  // Skip chromosomes that are not in the current list. The intersection of a chromosome
158  // that is only in the rhs but not in lhs is empty anyway, so nothing to do.
159  if( ! chrs_to_delete.count( chr.first ) ) {
160  assert( ! lhs.has_chromosome( chr.first ));
161  continue;
162  }
163  assert( lhs.has_chromosome( chr.first ));
164  assert( rhs.has_chromosome( chr.first ));
165 
166  // Whenever a bitvector is set for a chromosome, we give it at least size 1,
167  // so we can at least always access the bit 0. Assert this.
168  assert( lhs.locus_map_.at( chr.first ).size() > 0 );
169  assert( rhs.locus_map_.at( chr.first ).size() > 0 );
170 
171  // Get some shorthands, for readability, and a bit for speed as well.
172  auto& lhs_bits = lhs.locus_map_.at( chr.first );
173  auto const& rhs_bits = rhs.locus_map_.at( chr.first );
174  auto const lhs_bit_0 = lhs_bits.get( 0 );
175  auto const rhs_bit_0 = rhs_bits.get( 0 );
176  assert( &rhs_bits == &chr.second );
177 
178  // We found a chromosome that is in both lists, let's process it.
179  if( lhs_bit_0 && rhs_bit_0 ) {
180  // If both have the full chromosome, we use this opportunity to shorten the vector.
181  // Make a new bitvector that just marks this chromosome.
182  lhs_bits = Bitvector( 1, true );
183  } else if( lhs_bit_0 && !rhs_bit_0 ) {
184  // lhs uses the whole chromosome, rhs not. The intersection of this is rhs.
185  lhs_bits = rhs_bits;
186  assert( lhs_bits.size() > 0 );
187  assert( !lhs_bits.get(0) );
188  } else if( !lhs_bit_0 && rhs_bit_0 ) {
189  // lhs does not use the whole chromosome, but rhs does.
190  // The intersection of this is just lhs again, so nothing to do here.
191  assert( lhs_bits.size() > 0 );
192  assert( !lhs_bits.get(0) );
193  } else {
194  assert( !lhs_bit_0 && !rhs_bit_0 );
195  // Actual intersection of the two vectors.
196  // We use the smaller one as our target size, hence `use_larger == false` here.
197  // Everything behind those positions will end up false anyway when intersecting.
198  lhs_bits = bitwise_and( lhs_bits, rhs_bits, false );
199  assert( lhs_bits.size() > 0 );
200  assert( !lhs_bits.get(0) );
201  }
202 
203  // If the result has any positions set, this is still a chromosome that we want to keep,
204  // so remove it from the to-delete list. If all its bits are 0, we have eliminated
205  // all positions from the filter, so we might as well delete the whole vector; in that
206  // case, we simply keept it in the to-delete list and then it gets removed below.
207  if( lhs_bits.count() > 0 ) {
208  chrs_to_delete.erase( chr.first );
209  }
210  }
211 
212  // Delete all chromosomes from lhs that were not also in rhs, or ended up empty.
213  for( auto const& chr : chrs_to_delete ) {
214  assert( lhs.has_chromosome( chr ));
215  lhs.locus_map_.erase( chr );
216  }
217 }
218 
220 {
221  // Remark on the name: `union` is a C++ keyword, so we cannot name the function just that...
222  using namespace genesis::utils;
223 
224  // Shorthand for clarity and for ease of refactoring
225  // if we need to make this a free function later.
226  auto& lhs = *this;
227 
228  // Go through all chromosomes of the rhs.
229  for( auto const& chr : rhs.locus_map_ ) {
230  // Shorthands. We cannot access the lhs positions yet, as they might not exist.
231  auto const& rhs_bits = rhs.locus_map_.at( chr.first );
232  assert( &rhs_bits == &chr.second );
233 
234  if( lhs.has_chromosome( chr.first )) {
235  assert( lhs.locus_map_.count( chr.first ) > 0 );
236  auto& lhs_bits = lhs.locus_map_.at( chr.first );
237  if( lhs_bits.get(0) || rhs_bits.get(0) ) {
238  // We check the special 0 bit case here, meaning that if either of the vectors
239  // has the bit set, we shorten the vector here, to save some memory.
240  lhs_bits = Bitvector( 1, true );
241  } else {
242  // Compute actual union of the two vectors.
243  // Here, we use `use_larger == true`, so that the longer vector is used,
244  // with all its bits that are not in the other one.
245  lhs_bits = bitwise_or( lhs_bits, rhs_bits, true );
246  }
247  } else {
248  // lhs does not have the chromosome, so we just copy. We here use the array
249  // operator to create the entry in the map first. We also again do a special case
250  // and shorten all-chromosome vectors here, while we are at it.
251  assert( lhs.locus_map_.count( chr.first ) == 0 );
252  auto& lhs_bits = lhs.locus_map_[ chr.first ];
253  if( rhs_bits.get(0) ) {
254  lhs_bits = Bitvector( 1, true );
255  } else {
256  lhs_bits = rhs_bits;
257  }
258  }
259  }
260 }
261 
262 void GenomeLocusSet::invert( sequence::SequenceDict const& sequence_dict )
263 {
264  using namespace genesis::utils;
265 
266  for( auto& chr_bv : locus_map_ ) {
267  // Basic check for the chromosome
268  if( ! sequence_dict.contains( chr_bv.first )) {
269  throw std::runtime_error(
270  "Cannot invert Genome Locus Set for chromosome \"" + chr_bv.first +
271  "\", as the given Sequence Dict does not contain an entry for that chromosome"
272  );
273  }
274 
275  // Get the sequence dict entry and check the lengths
276  auto const& seq_entry = sequence_dict.get( chr_bv.first );
277  assert( chr_bv.second.size() > 0 );
278  if( chr_bv.second.size() - 1 > seq_entry.size() ) {
279  throw std::runtime_error(
280  "Cannot invert Genome Locus Set for chromosome \"" + chr_bv.first +
281  "\", as the given Sequence Dict indicates its length to be " +
282  std::to_string( seq_entry.size() ) + ", instead of " +
283  std::to_string( chr_bv.second.size() - 1 )
284  );
285  }
286 
287  // Create an empty bitvector of the desired size, and fill it with the bits set.
288  // We make sure that it gets the larger size, which is the seq entry size (or equal).
289  // Again we need to reserve an additional bit for special position 0.
290  // If the "all" marker is set, the inversion is "nothing", so then we just use a single bit.
291  auto new_bv = Bitvector( 1 );
292  if( ! chr_bv.second.get(0) ) {
293  new_bv = Bitvector( seq_entry.size() + 1, chr_bv.second );
294  new_bv.negate();
295  new_bv.set( 0, false );
296  assert( new_bv.size() >= chr_bv.second.size() );
297  }
298 
299  // Finally update our data
300  locus_map_[ chr_bv.first ] = new_bv;
301  }
302 }
303 
304 } // namespace population
305 } // namespace genesis
genesis::population::GenomeLocusSet::set_union
void set_union(GenomeLocusSet const &rhs)
Compute the union with another GenomeLocusSet rhs.
Definition: genome_locus_set.cpp:219
genesis::population::GenomeLocusSet::end
const_iterator end() const
Return an iterator to the end of the map of chromosome names to Bitvectors.
Definition: genome_locus_set.hpp:475
genesis::sequence::SequenceDict::contains
bool contains(std::string const &name) const
Definition: sequence_dict.hpp:252
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::population::GenomeLocusSet::has_chromosome
bool has_chromosome(std::string const &chromosome) const
Return whether a chromosome is stored.
Definition: genome_locus_set.hpp:515
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
operators.hpp
genesis::population::GenomeLocusSet::invert
void invert(sequence::SequenceDict const &sequence_dict)
Invert all chromosome regions.
Definition: genome_locus_set.cpp:262
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genome_locus_set.hpp
genesis::utils::bitwise_or
Bitvector bitwise_or(Bitvector const &lhs, Bitvector const &rhs, bool use_larger)
Take the bitwise or of two Bitvectors of potentially different size.
Definition: utils/math/bitvector/operators.cpp:58
genesis::population::GenomeLocusSet::set_intersect
void set_intersect(GenomeLocusSet const &rhs)
Compute the intersection with another GenomeLocusSet rhs.
Definition: genome_locus_set.cpp:139
genesis::sequence::SequenceDict::get
Entry const & get(std::string const &name) const
Definition: sequence_dict.hpp:233
genesis::utils::bitwise_and
Bitvector bitwise_and(Bitvector const &lhs, Bitvector const &rhs, bool use_larger)
Take the bitwise and of two Bitvectors of potentially different size.
Definition: utils/math/bitvector/operators.cpp:45
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127
genesis::utils::Bitvector
Definition: bitvector.hpp:50