A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
twobit_vector.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <assert.h>
36 #include <stdexcept>
37 
38 namespace genesis {
39 namespace utils {
40 
41 // ================================================================================================
42 // Typedefs and Constants
43 // ================================================================================================
44 
48 const TwobitVector::WordType TwobitVector::all_0_ = 0ul;
49 
53 const TwobitVector::WordType TwobitVector::all_1_ = (((1ul << 32) - 1) << 32) + ((1ul << 32) - 1);
54 
67 const TwobitVector::WordType TwobitVector::bit_mask_[ TwobitVector::kValuesPerWord ] =
68 {
69  3ul << 0, 3ul << 2, 3ul << 4, 3ul << 6, 3ul << 8, 3ul << 10, 3ul << 12, 3ul << 14,
70  3ul << 16, 3ul << 18, 3ul << 20, 3ul << 22, 3ul << 24, 3ul << 26, 3ul << 28, 3ul << 30,
71  3ul << 32, 3ul << 34, 3ul << 36, 3ul << 38, 3ul << 40, 3ul << 42, 3ul << 44, 3ul << 46,
72  3ul << 48, 3ul << 50, 3ul << 52, 3ul << 54, 3ul << 56, 3ul << 58, 3ul << 60, 3ul << 62
73 };
74 
91 const TwobitVector::WordType TwobitVector::ones_mask_[ TwobitVector::kValuesPerWord ] =
92 {
93  TwobitVector::all_0_, TwobitVector::all_1_ >> 62,
94  TwobitVector::all_1_ >> 60, TwobitVector::all_1_ >> 58,
95  TwobitVector::all_1_ >> 56, TwobitVector::all_1_ >> 54,
96  TwobitVector::all_1_ >> 52, TwobitVector::all_1_ >> 50,
97  TwobitVector::all_1_ >> 48, TwobitVector::all_1_ >> 46,
98  TwobitVector::all_1_ >> 44, TwobitVector::all_1_ >> 42,
99  TwobitVector::all_1_ >> 40, TwobitVector::all_1_ >> 38,
100  TwobitVector::all_1_ >> 36, TwobitVector::all_1_ >> 34,
101  TwobitVector::all_1_ >> 32, TwobitVector::all_1_ >> 30,
102  TwobitVector::all_1_ >> 28, TwobitVector::all_1_ >> 26,
103  TwobitVector::all_1_ >> 24, TwobitVector::all_1_ >> 22,
104  TwobitVector::all_1_ >> 20, TwobitVector::all_1_ >> 18,
105  TwobitVector::all_1_ >> 16, TwobitVector::all_1_ >> 14,
106  TwobitVector::all_1_ >> 12, TwobitVector::all_1_ >> 10,
107  TwobitVector::all_1_ >> 8, TwobitVector::all_1_ >> 6,
108  TwobitVector::all_1_ >> 4, TwobitVector::all_1_ >> 2
109 };
110 
111 // ================================================================================================
112 // Constructors and Rule of Five
113 // ================================================================================================
114 
119  : size_( size )
120  , data_( ( size / kValuesPerWord ) + ( size % kValuesPerWord == 0 ? 0 : 1 ), 0 )
121 {}
122 
123 // ================================================================================================
124 // Accessors
125 // ================================================================================================
126 
131 size_t TwobitVector::size() const
132 {
133  return size_;
134 }
135 
141 {
142  assert( ( size_ / kValuesPerWord ) + ( size_ % kValuesPerWord == 0 ? 0 : 1 ) == data_.size() );
143  return data_.size();
144 }
145 
150 {
151  if (index >= size_) {
152  throw std::out_of_range( "TwobitVector::get: Invalid index." );
153  }
154 
155  // Get the two-bit value at index, still at its original position in the word.
156  auto segment = data_[ index / kValuesPerWord ] & bit_mask_[ index % kValuesPerWord ];
157 
158  // Shift it to the right, so that we can cast it to a value type.
159  return static_cast< ValueType >( segment >> ( 2 * ( index % kValuesPerWord )));
160 }
161 
166 {
167  return get( index );
168 }
169 
176 TwobitVector::WordType const& TwobitVector::data_at( size_t index ) const
177 {
178  return data_.at( index );
179 }
180 
188 {
189  return data_.at( index );
190 }
191 
199 {
200  WordType result = static_cast< WordType >( size() );
201  for( auto s : data_ ) {
202  result ^= s;
203  }
204  return result;
205 }
206 
207 // ================================================================================================
208 // Operators
209 // ================================================================================================
210 
214 bool TwobitVector::operator == ( TwobitVector const& other ) const
215 {
216  if( size_ != other.size_ ) {
217  return false;
218  }
219  for( size_t i = 0; i < data_.size(); ++i ) {
220  if( data_[i] != other.data_[i] ) {
221  return false;
222  }
223  }
224  return true;
225 }
226 
230 bool TwobitVector::operator != ( TwobitVector const& other ) const
231 {
232  return !(*this == other);
233 }
234 
242 {
243  // Check if the size is correct.
244  if( ( size_ / kValuesPerWord ) + ( size_ % kValuesPerWord == 0 ? 0 : 1 ) != data_.size() ) {
245  LOG_INFO << "Sizes does not match.";
246  return false;
247  }
248 
249  // Check if the zero padding at the end is correct
250  // (only if we do have padding though).
251  if(
252  ( size_ % kValuesPerWord != 0 )
253  && (( data_.back() & ~ ones_mask_[ size_ % kValuesPerWord ] ) != 0ul )
254  ) {
255  LOG_INFO << "Invalid padding bits.";
256  return false;
257  }
258 
259  return true;
260 }
261 
262 // ================================================================================================
263 // Modifiers
264 // ================================================================================================
265 
269 void TwobitVector::set( size_t index, TwobitVector::ValueType value )
270 {
271  if( index >= size_ ) {
272  throw std::out_of_range( "TwobitVector::set: Invalid index." );
273  }
274 
275  // Shift the value to the correct position within the word.
276  auto const tmp = static_cast< WordType >( value ) << ( 2 * ( index % kValuesPerWord ));
277 
278  // Unset the bits at the position in the word, and reset them to the value.
279  // (Unfortunately, we are not operation on single bits, so a simple `and` or `or`
280  // does not work here. Maybe there are smarter ways, but this one works for now.)
281  data_[ index / kValuesPerWord ] &= ~ bit_mask_[ index % kValuesPerWord ];
282  data_[ index / kValuesPerWord ] |= tmp;
283 }
284 
291 {
292  if( index > size_ ) {
293  throw std::out_of_range( "TwobitVector::insert_at: Invalid index." );
294  }
295 
296  // Shorthands.
297  size_t const word_id = index / kValuesPerWord;
298  size_t const segm_id = index % kValuesPerWord;
299 
300  // If the last word is fully used, we need to add a new one.
301  if( size_ % kValuesPerWord == 0 ) {
302  data_.push_back( 0ul );
303  }
304 
305  // Shift all the data right of the insertion word by one.
306  for( size_t i = data_.size() - 1; i > word_id; --i ) {
307 
308  // Shift the data by one value. We do not loose anything, because the value that is
309  // shifted out of the word was already processed in a previous iteration of this
310  // loop (or is zero anyway in the first iteration).
311  data_[ i ] <<= 2;
312 
313  // Get the value of the next word (the one that will be shifted away in the next
314  // iteration of this loop), and move it to the other end of the word.
315  auto bleed = data_[ i - 1 ] >> ( sizeof( WordType ) * 8 - 2 );
316 
317  // Now add this to the current word, so that the bits that are zero (because of the
318  // previous shifting) are filled again with the correct value.
319  data_[ i ] |= bleed;
320  }
321 
322  // Get the values in the insertion word that are right of the insertion position.
323  auto const remainder = data_[ word_id ] & ( ~ ones_mask_[ segm_id ] );
324 
325  // Delete those values in the word.
326  data_[ word_id ] &= ones_mask_[ segm_id ];
327 
328  // Restore them, shifted by one position. Now we have room for the actual insertion.
329  data_[ word_id ] |= ( remainder << 2 );
330 
331  // Shift the insertion value to its position, store it in the word, and adjust the size.
332  auto const val_shifted = static_cast< WordType >( value ) << ( 2 * segm_id );
333  data_[ word_id ] |= val_shifted;
334  ++size_;
335 }
336 
342 void TwobitVector::remove_at( size_t index )
343 {
344  if( index >= size_ ) {
345  throw std::out_of_range( "TwobitVector::remove_at: Invalid index." );
346  }
347 
348  // Shorthands.
349  size_t const word_id = index / kValuesPerWord;
350  size_t const segm_id = index % kValuesPerWord;
351 
352  // If the position in the word is not the last:
353  if( segm_id < kValuesPerWord - 1 ) {
354 
355  // Get the part of the word that needs to be shifted.
356  auto remainder = data_[ word_id ] & ( ~ ones_mask_[ segm_id + 1 ] );
357 
358  // Delete this part.
359  data_[ word_id ] &= ones_mask_[ segm_id ];
360 
361  // Reset it with the shifted rest values.
362  data_[ word_id ] |= ( remainder >> 2 );
363 
364  // If it is the last position in the word, we do not need to shift a rest,
365  // but can just unset the last value.
366  } else {
367  data_[ word_id ] &= ones_mask_[ segm_id ];
368  }
369 
370  // If the word of the deletion is not the last, we need to shift values.
371  if( word_id < data_.size() - 1 ) {
372 
373  // Get the first value of the next word and store it as the last value in the
374  // word where we just deleted a value.
375  auto bleed = data_[ word_id + 1 ] << ( sizeof( WordType ) * 8 - 2 );
376  data_[ word_id ] |= bleed;
377 
378  // Move all values in the remaining words (except the last one) by one.
379  for( size_t i = word_id + 1; i < data_.size() - 1; ++i ) {
380  bleed = data_[ i + 1 ] << ( sizeof( WordType ) * 8 - 2 );
381  data_[ i ] >>= 2;
382  data_[ i ] |= bleed;
383  }
384 
385  // The last word does not need to store the last value of following words, so we
386  // can just shift it.
387  data_.back() >>= 2;
388  }
389 
390  // Adjust the size. If we now have a useless word at the end of the vector, remove it.
391  --size_;
392  if( size_ % kValuesPerWord == 0 ) {
393  data_.pop_back();
394  }
395 
396  // Assert that the size of the vector is correct.
397  assert(
398  ( size_ / kValuesPerWord ) + ( size_ % kValuesPerWord == 0 ? 0 : 1 ) == data_.size()
399  );
400 }
401 
406 {
407  size_ = 0;
408  data_.clear();
409 }
410 
411 } // namespace utils
412 } // namespace genesis
void insert_at(size_t index, ValueType value)
Insert a value at a position.
bool operator==(TwobitVector const &other) const
Equality operator.
size_t data_size() const
Return the number of words (of type WordType) that are used to store the values in the vector...
static const size_t kValuesPerWord
Constant that holds the number of values (of tyoe ValueType) that are stored in a single word in the ...
size_t size() const
Return the size of the vector, that is, how many values (of type ValueType) it currently holds...
WordType const & data_at(size_t index) const
Return a single word of the vector.
void remove_at(size_t index)
Remove the value at a position.
uint64_t WordType
Underlying word type for the bitvector.
void set(size_t index, ValueType value)
Set a value at a position in the vector.
ValueType operator[](size_t index) const
Alias for get().
WordType hash() const
Calculate a hash value of the vector, based on its size() and the xor of all its words.
Provides easy and fast logging functionality.
bool validate() const
Validation function that checks some basic invariants.
void clear()
Clear the vector, so that it contains no data.
ValueType
Value Type enumeration for the elements of a TwobitVector.
bool operator!=(TwobitVector const &other) const
Inequality operator, opposite of operator==().
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98
ValueType get(size_t index) const
Get the value at a position in the vector.