A library for working with phylogenetic and population genetic data.
v0.32.0
twobit_vector.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 
38 namespace genesis {
39 namespace utils {
40 
41 // ================================================================================================
42 // Typedefs and Constants
43 // ================================================================================================
44 
48 const TwobitVector::WordType TwobitVector::all_0_ = 0ul;
49 
53 const TwobitVector::WordType TwobitVector::all_1_ = (((1ul << 32) - 1) << 32) + ((1ul << 32) - 1);
54 
67 const TwobitVector::WordType TwobitVector::bit_mask_[ TwobitVector::kValuesPerWord ] =
68 {
69  3ul << 0, 3ul << 2, 3ul << 4, 3ul << 6, 3ul << 8, 3ul << 10, 3ul << 12, 3ul << 14,
70  3ul << 16, 3ul << 18, 3ul << 20, 3ul << 22, 3ul << 24, 3ul << 26, 3ul << 28, 3ul << 30,
71  3ul << 32, 3ul << 34, 3ul << 36, 3ul << 38, 3ul << 40, 3ul << 42, 3ul << 44, 3ul << 46,
72  3ul << 48, 3ul << 50, 3ul << 52, 3ul << 54, 3ul << 56, 3ul << 58, 3ul << 60, 3ul << 62
73 };
74 
91 const TwobitVector::WordType TwobitVector::ones_mask_[ TwobitVector::kValuesPerWord ] =
92 {
93  TwobitVector::all_0_, TwobitVector::all_1_ >> 62,
94  TwobitVector::all_1_ >> 60, TwobitVector::all_1_ >> 58,
95  TwobitVector::all_1_ >> 56, TwobitVector::all_1_ >> 54,
96  TwobitVector::all_1_ >> 52, TwobitVector::all_1_ >> 50,
97  TwobitVector::all_1_ >> 48, TwobitVector::all_1_ >> 46,
98  TwobitVector::all_1_ >> 44, TwobitVector::all_1_ >> 42,
99  TwobitVector::all_1_ >> 40, TwobitVector::all_1_ >> 38,
100  TwobitVector::all_1_ >> 36, TwobitVector::all_1_ >> 34,
101  TwobitVector::all_1_ >> 32, TwobitVector::all_1_ >> 30,
102  TwobitVector::all_1_ >> 28, TwobitVector::all_1_ >> 26,
103  TwobitVector::all_1_ >> 24, TwobitVector::all_1_ >> 22,
104  TwobitVector::all_1_ >> 20, TwobitVector::all_1_ >> 18,
105  TwobitVector::all_1_ >> 16, TwobitVector::all_1_ >> 14,
106  TwobitVector::all_1_ >> 12, TwobitVector::all_1_ >> 10,
107  TwobitVector::all_1_ >> 8, TwobitVector::all_1_ >> 6,
108  TwobitVector::all_1_ >> 4, TwobitVector::all_1_ >> 2
109 };
110 
111 // ================================================================================================
112 // Constructors and Rule of Five
113 // ================================================================================================
114 
119  : size_( size )
120  , data_( ( size / kValuesPerWord ) + ( size % kValuesPerWord == 0 ? 0 : 1 ), 0 )
121 {}
122 
123 // ================================================================================================
124 // Accessors
125 // ================================================================================================
126 
131 size_t TwobitVector::size() const
132 {
133  return size_;
134 }
135 
141 {
142  assert( ( size_ / kValuesPerWord ) + ( size_ % kValuesPerWord == 0 ? 0 : 1 ) == data_.size() );
143  return data_.size();
144 }
145 
150 {
151  if (index >= size_) {
152  throw std::out_of_range( "TwobitVector::get: Invalid index." );
153  }
154 
155  // Get the two-bit value at index, still at its original position in the word.
156  auto segment = data_[ index / kValuesPerWord ] & bit_mask_[ index % kValuesPerWord ];
157 
158  // Shift it to the right, so that we can cast it to a value type.
159  return static_cast< ValueType >( segment >> ( 2 * ( index % kValuesPerWord )));
160 }
161 
166 {
167  return get( index );
168 }
169 
176 TwobitVector::WordType const& TwobitVector::data_at( size_t index ) const
177 {
178  return data_.at( index );
179 }
180 
188 {
189  return data_.at( index );
190 }
191 
199 {
200  WordType result = static_cast< WordType >( size() );
201  for( auto s : data_ ) {
202  result ^= s;
203  }
204  return result;
205 }
206 
207 // ================================================================================================
208 // Operators
209 // ================================================================================================
210 
214 bool TwobitVector::operator == ( TwobitVector const& other ) const
215 {
216  if( size_ != other.size_ ) {
217  return false;
218  }
219  for( size_t i = 0; i < data_.size(); ++i ) {
220  if( data_[i] != other.data_[i] ) {
221  return false;
222  }
223  }
224  return true;
225 }
226 
230 bool TwobitVector::operator != ( TwobitVector const& other ) const
231 {
232  return !(*this == other);
233 }
234 
242 {
243  // Check if the size is correct.
244  if( ( size_ / kValuesPerWord ) + ( size_ % kValuesPerWord == 0 ? 0 : 1 ) != data_.size() ) {
245  LOG_INFO << "Sizes does not match.";
246  return false;
247  }
248 
249  // Check if the zero padding at the end is correct
250  // (only if we do have padding though).
251  if(
252  ( size_ % kValuesPerWord != 0 )
253  && (( data_.back() & ~ ones_mask_[ size_ % kValuesPerWord ] ) != 0ul )
254  ) {
255  LOG_INFO << "Invalid padding bits.";
256  return false;
257  }
258 
259  return true;
260 }
261 
262 // ================================================================================================
263 // Modifiers
264 // ================================================================================================
265 
269 void TwobitVector::set( size_t index, TwobitVector::ValueType value )
270 {
271  if( index >= size_ ) {
272  throw std::out_of_range( "TwobitVector::set: Invalid index." );
273  }
274 
275  // Shift the value to the correct position within the word.
276  auto const tmp = static_cast< WordType >( value ) << ( 2 * ( index % kValuesPerWord ));
277 
278  // Unset the bits at the position in the word, and reset them to the value.
279  // (Unfortunately, we are not operation on single bits, so a simple `and` or `or`
280  // does not work here. Maybe there are smarter ways, but this one works for now.)
281  data_[ index / kValuesPerWord ] &= ~ bit_mask_[ index % kValuesPerWord ];
282  data_[ index / kValuesPerWord ] |= tmp;
283 }
284 
291 {
292  if( index > size_ ) {
293  throw std::out_of_range( "TwobitVector::insert_at: Invalid index." );
294  }
295 
296  // Shorthands.
297  size_t const word_id = index / kValuesPerWord;
298  size_t const segm_id = index % kValuesPerWord;
299 
300  // If the last word is fully used, we need to add a new one.
301  if( size_ % kValuesPerWord == 0 ) {
302  data_.push_back( 0ul );
303  }
304 
305  // Shift all the data right of the insertion word by one.
306  for( size_t i = data_.size() - 1; i > word_id; --i ) {
307 
308  // Shift the data by one value. We do not loose anything, because the value that is
309  // shifted out of the word was already processed in a previous iteration of this
310  // loop (or is zero anyway in the first iteration).
311  data_[ i ] <<= 2;
312 
313  // Get the value of the next word (the one that will be shifted away in the next
314  // iteration of this loop), and move it to the other end of the word.
315  auto bleed = data_[ i - 1 ] >> ( sizeof( WordType ) * 8 - 2 );
316 
317  // Now add this to the current word, so that the bits that are zero (because of the
318  // previous shifting) are filled again with the correct value.
319  data_[ i ] |= bleed;
320  }
321 
322  // Get the values in the insertion word that are right of the insertion position.
323  auto const remainder = data_[ word_id ] & ( ~ ones_mask_[ segm_id ] );
324 
325  // Delete those values in the word.
326  data_[ word_id ] &= ones_mask_[ segm_id ];
327 
328  // Restore them, shifted by one position. Now we have room for the actual insertion.
329  data_[ word_id ] |= ( remainder << 2 );
330 
331  // Shift the insertion value to its position, store it in the word, and adjust the size.
332  auto const val_shifted = static_cast< WordType >( value ) << ( 2 * segm_id );
333  data_[ word_id ] |= val_shifted;
334  ++size_;
335 }
336 
342 void TwobitVector::remove_at( size_t index )
343 {
344  if( index >= size_ ) {
345  throw std::out_of_range( "TwobitVector::remove_at: Invalid index." );
346  }
347 
348  // Shorthands.
349  size_t const word_id = index / kValuesPerWord;
350  size_t const segm_id = index % kValuesPerWord;
351 
352  // If the position in the word is not the last:
353  if( segm_id < kValuesPerWord - 1 ) {
354 
355  // Get the part of the word that needs to be shifted.
356  auto remainder = data_[ word_id ] & ( ~ ones_mask_[ segm_id + 1 ] );
357 
358  // Delete this part.
359  data_[ word_id ] &= ones_mask_[ segm_id ];
360 
361  // Reset it with the shifted rest values.
362  data_[ word_id ] |= ( remainder >> 2 );
363 
364  // If it is the last position in the word, we do not need to shift a rest,
365  // but can just unset the last value.
366  } else {
367  data_[ word_id ] &= ones_mask_[ segm_id ];
368  }
369 
370  // If the word of the deletion is not the last, we need to shift values.
371  if( word_id < data_.size() - 1 ) {
372 
373  // Get the first value of the next word and store it as the last value in the
374  // word where we just deleted a value.
375  auto bleed = data_[ word_id + 1 ] << ( sizeof( WordType ) * 8 - 2 );
376  data_[ word_id ] |= bleed;
377 
378  // Move all values in the remaining words (except the last one) by one.
379  for( size_t i = word_id + 1; i < data_.size() - 1; ++i ) {
380  bleed = data_[ i + 1 ] << ( sizeof( WordType ) * 8 - 2 );
381  data_[ i ] >>= 2;
382  data_[ i ] |= bleed;
383  }
384 
385  // The last word does not need to store the last value of following words, so we
386  // can just shift it.
387  data_.back() >>= 2;
388  }
389 
390  // Adjust the size. If we now have a useless word at the end of the vector, remove it.
391  --size_;
392  if( size_ % kValuesPerWord == 0 ) {
393  data_.pop_back();
394  }
395 
396  // Assert that the size of the vector is correct.
397  assert(
398  ( size_ / kValuesPerWord ) + ( size_ % kValuesPerWord == 0 ? 0 : 1 ) == data_.size()
399  );
400 }
401 
406 {
407  size_ = 0;
408  data_.clear();
409 }
410 
411 } // namespace utils
412 } // namespace genesis
LOG_INFO
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:100
twobit_vector.hpp
genesis::utils::TwobitVector::data_at
WordType const & data_at(size_t index) const
Return a single word of the vector.
Definition: twobit_vector.cpp:176
genesis::utils::TwobitVector
Definition: twobit_vector.hpp:41
genesis::utils::TwobitVector::data_size
size_t data_size() const
Return the number of words (of type WordType) that are used to store the values in the vector.
Definition: twobit_vector.cpp:140
genesis::utils::TwobitVector::WordType
uint64_t WordType
Underlying word type for the bitvector.
Definition: twobit_vector.hpp:55
genesis::utils::TwobitVector::insert_at
void insert_at(size_t index, ValueType value)
Insert a value at a position.
Definition: twobit_vector.cpp:290
genesis::utils::TwobitVector::set
void set(size_t index, ValueType value)
Set a value at a position in the vector.
Definition: twobit_vector.cpp:269
genesis::utils::TwobitVector::operator!=
bool operator!=(TwobitVector const &other) const
Inequality operator, opposite of operator==().
Definition: twobit_vector.cpp:230
genesis::utils::TwobitVector::validate
bool validate() const
Validation function that checks some basic invariants.
Definition: twobit_vector.cpp:241
logging.hpp
Provides easy and fast logging functionality.
genesis::utils::TwobitVector::operator[]
ValueType operator[](size_t index) const
Alias for get().
Definition: twobit_vector.cpp:165
genesis::utils::TwobitVector::clear
void clear()
Clear the vector, so that it contains no data.
Definition: twobit_vector.cpp:405
genesis::utils::TwobitVector::operator==
bool operator==(TwobitVector const &other) const
Equality operator.
Definition: twobit_vector.cpp:214
genesis::utils::TwobitVector::ValueType
ValueType
Value Type enumeration for the elements of a TwobitVector.
Definition: twobit_vector.hpp:66
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::TwobitVector::kValuesPerWord
static const size_t kValuesPerWord
Constant that holds the number of values (of tyoe ValueType) that are stored in a single word in the ...
Definition: twobit_vector.hpp:79
genesis::utils::TwobitVector::remove_at
void remove_at(size_t index)
Remove the value at a position.
Definition: twobit_vector.cpp:342
genesis::utils::TwobitVector::TwobitVector
TwobitVector()=default
genesis::utils::TwobitVector::hash
WordType hash() const
Calculate a hash value of the vector, based on its size() and the xor of all its words.
Definition: twobit_vector.cpp:198
genesis::utils::TwobitVector::size
size_t size() const
Return the size of the vector, that is, how many values (of type ValueType) it currently holds.
Definition: twobit_vector.cpp:131
genesis::utils::TwobitVector::get
ValueType get(size_t index) const
Get the value at a position in the vector.
Definition: twobit_vector.cpp:149