A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utils/math/twobit_vector/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include <algorithm>
34 #include <assert.h>
35 #include <stdexcept>
36 
37 namespace genesis {
38 namespace utils {
39 
40 // =================================================================================================
41 // Strings
42 // =================================================================================================
43 
50 {
51  switch( site ) {
52  case 'a':
53  case 'A':
55 
56  case 'c':
57  case 'C':
59 
60  case 'g':
61  case 'G':
63 
64  case 't':
65  case 'T':
67 
68  default:
69  throw std::runtime_error( "Invalid nucleic acid." );
70  }
71 }
72 
79 {
80  switch( value ) {
81  case TwobitVector::ValueType::A: return 'A';
82  case TwobitVector::ValueType::C: return 'C';
83  case TwobitVector::ValueType::G: return 'G';
84  case TwobitVector::ValueType::T: return 'T';
85  default:
86  throw std::runtime_error( "Invalid twobit value." );
87  }
88 }
89 
93 TwobitVector from_nucleic_acids( std::string const& sequence )
94 {
95  // We set each value individually.
96  // Shifting them should be faster - future optimization!
97  auto result = TwobitVector( sequence.size() );
98  for( size_t i = 0; i < sequence.size(); ++i ) {
99  result.set( i, translate_from_nucleic_acid( sequence[i] ));
100  }
101  return result;
102 }
103 
107 std::string to_nucleic_acids( TwobitVector const& vec )
108 {
109  std::string result;
110  result.reserve( vec.size() );
111 
112  for( size_t i = 0; i < vec.size(); ++i ) {
113  result += translate_to_nucleic_acid( vec[i] );
114  }
115  return result;
116 }
117 
124 std::string bitstring( TwobitVector const& vec )
125 {
126  // Put each word on a new line.
127  std::string res = "";
128  for( size_t i = 0; i < vec.data_size(); ++i ) {
129  res += bitstring( vec.data_at( i )) + "\n";
130  }
131  return res;
132 }
133 
140 std::string bitstring( TwobitVector::WordType const& vec )
141 {
142  // This is an ugly quick hack function. Would be nicer to use bitmasks, but they are hidden
143  // inside the TwobitVector class and exposing them just for this purpuse is also not nice.
144 
145  // Make a copy, so that we can shift away the processed parts.
146  auto cpy = vec;
147  std::string res;
148  res.reserve( 96 );
149 
150  // Go through the word two bits at a time, and store the right-most bits
151  // in the string. This way, we obtain a reverse order string, so we need to reverse
152  // it later again. Also, note that the bit strings for 1 and 2 are reverses in the switch
153  // statement because of this.
154  for( size_t i = 0; i < TwobitVector::kValuesPerWord; ++i ) {
155  auto tmp = cpy & 0x3;
156  switch( tmp ) {
157  case 0x0:
158  res += "00";
159  break;
160 
161  case 0x1:
162  res += "10";
163  break;
164 
165  case 0x2:
166  res += "01";
167  break;
168 
169  case 0x3:
170  res += "11";
171  break;
172 
173  default:
174  assert( false );
175  }
176  if( i < TwobitVector::kValuesPerWord - 1 ) {
177  res += " ";
178  }
179  cpy >>= 2;
180  }
181 
182  std::reverse( res.begin(), res.end() );
183  return res;
184 }
185 
186 } // namespace utils
187 } // namespace genesis
TwobitVector::ValueType translate_from_nucleic_acid(char site)
Translate a char into TwobitVector::ValueType.
size_t data_size() const
Return the number of words (of type WordType) that are used to store the values in the vector...
static const size_t kValuesPerWord
Constant that holds the number of values (of type ValueType) that are stored in a single word in the ...
std::string to_nucleic_acids(TwobitVector const &vec)
Turn a TwobitVector into its string representation of nucleic acids.
size_t size() const
Return the size of the vector, that is, how many values (of type ValueType) it currently holds...
WordType const & data_at(size_t index) const
Return a single word of the vector.
std::string bitstring(TwobitVector const &vec)
Return a string with a bit-representation of a TwobitVector.
TwobitVector from_nucleic_acids(std::string const &sequence)
Turn a string of nucleic acids into a TwobitVector.
uint64_t WordType
Underlying word type for the bitvector.
void set(size_t index, ValueType value)
Set a value at a position in the vector.
char translate_to_nucleic_acid(TwobitVector::ValueType value)
Translate a TwobitVector::ValueType into its char representation.
ValueType
Value Type enumeration for the elements of a TwobitVector.