A library for working with phylogenetic and population genetic data.
v0.32.0
base64.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2023 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 
38 namespace genesis {
39 namespace utils {
40 
41 // =================================================================================================
42 // Base 64 Encode/Decode
43 // =================================================================================================
44 
45 // Code adaptem from https://en.wikibooks.org/wiki/Algorithm_Implementation/Miscellaneous/Base64#C++
46 
47 static const char base64_encode_lookup_[] =
48  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
49 static const char base64_pad_char_ = '=';
50 
51 template<class T>
52 std::string base64_encode_( T const& input, size_t line_length )
53 {
54  if( input.empty() ) {
55  return {};
56  }
57 
58  // Init and reserve space for result. We need both the actual length of the content,
59  // as well as the size for reserving, which might include room for new line chars.
60  std::string encoded;
61  size_t char_len = (( input.size() / 3 ) + ( input.size() % 3 > 0 )) * 4;
62  size_t char_res = char_len;
63  if( line_length > 0 ) {
64  // Reserve extra for new line chars, minus the trailing one if the division is exact.
65  char_res += char_res / line_length - ( char_res % line_length == 0 );
66  }
67  encoded.reserve( char_res );
68 
69  // We use a lambda to simplify putting chars, counting them, and wrapping lines as neccessary.
70  size_t out_cnt = 0;
71  auto put_char = [&]( char c ){
72  // Put the char
73  assert( encoded.size() + 1 <= char_res );
74  assert( encoded.size() + 1 <= encoded.capacity() );
75  encoded.append( 1, c );
76 
77  // Line wrapping as needed: Every time the modulo fires, expect for the beginning, and
78  // if the total length is an exact multiple, in which case we do not add a trailing new line.
79  ++out_cnt;
80  if( line_length > 0 && out_cnt % line_length == 0 && out_cnt < char_len ) {
81  assert( encoded.size() + 1 <= char_res );
82  assert( encoded.size() + 1 <= encoded.capacity() );
83  encoded.append( 1, '\n' );
84  }
85  };
86 
87  // Process main part of the input
88  std::uint32_t temp = 0;
89  auto it = input.begin();
90  for( std::size_t i = 0; i < input.size() / 3; ++i ) {
91  // Convert to big endian. We here also need to cast from whatever class T we have here
92  // (might be std::string, that is, char) to an unsigned char first, so that the actual
93  // byte value works, and then to the temp type, so that we can do the shifting properly.
94  // We cannot immediately cast to uint32_t here, as that has different module/casting
95  // behaviour that char to unsigned char casting. Hope that I got this right...
96  assert( it != input.end() );
97  temp = static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 16;
98  assert( it != input.end() );
99  temp += static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 8;
100  assert( it != input.end() );
101  temp += static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ ));
102 
103  // Add to output
104  put_char( base64_encode_lookup_[( temp & 0x00FC0000 ) >> 18 ]);
105  put_char( base64_encode_lookup_[( temp & 0x0003F000 ) >> 12 ]);
106  put_char( base64_encode_lookup_[( temp & 0x00000FC0 ) >> 6 ]);
107  put_char( base64_encode_lookup_[( temp & 0x0000003F ) ]);
108  }
109 
110  // Process remaining part and add padding
111  switch( input.size() % 3 ) {
112  case 2: {
113  // assert not end
114  // Convert to big endian. See above for an explanation of the double cast.
115  assert( it != input.end() );
116  temp = static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 16;
117  assert( it != input.end() );
118  temp += static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 8;
119 
120  // Add to output
121  put_char( base64_encode_lookup_[( temp & 0x00FC0000 ) >> 18 ]);
122  put_char( base64_encode_lookup_[( temp & 0x0003F000 ) >> 12 ]);
123  put_char( base64_encode_lookup_[( temp & 0x00000FC0 ) >> 6 ]);
124  put_char( base64_pad_char_);
125  break;
126  }
127  case 1: {
128  // Convert to big endian
129  // assert not end
130  assert( it != input.end() );
131  temp = (*it++) << 16;
132 
133  // Add to output
134  put_char( base64_encode_lookup_[( temp & 0x00FC0000 ) >> 18 ]);
135  put_char( base64_encode_lookup_[( temp & 0x0003F000 ) >> 12 ]);
136  put_char( base64_pad_char_);
137  put_char( base64_pad_char_);
138  break;
139  }
140  }
141  assert( it == input.end() );
142 
143  // If our initial reservation was correct, we have reached exactly capacity;
144  // however, we cannot reliably compare against the actual capacity here, as the stupid
145  // Mac OS implementation seems to not honor the reserve() properly and hence has a slightly
146  // different (bigger) capacity, so instead we compare against our intended capacity instead.
147  assert( encoded.size() == char_res );
148  return encoded;
149 }
150 
151 template<class T>
152 T base64_decode_( std::string const& input )
153 {
154  // Edge case.
155  if( input.size() == 0 ) {
156  return T{};
157  }
158 
159  // Here, we are lazy (for now) and count the number of actual (non-new-line) chars directly.
160  // This is fast enough for now. If this ever becomes a bottleneck, we shoudl refactor and
161  // only loop once instead. On the other hand, that would prohibit properly reserving the
162  // needed output space... Tradeoffs. But does not matter too much now. We just count first.
163  size_t char_cnt = 0;
164  for( auto c : input ) {
165  // Use a non-branching counter to be as fast as possible.
166  static_assert( (int)true == 1 && (int)false == 0, "Boolean counting does not work." );
167  char_cnt += ! utils::is_space( c );
168  }
169  if( char_cnt % 4 ) {
170  throw std::runtime_error( "Invalid base64 length that is not a multiple of 4");
171  }
172 
173  // Get padding
174  std::size_t padding = 0;
175  assert( char_cnt >= 4 );
176  if( input[ char_cnt - 1 ] == base64_pad_char_ ) {
177  ++padding;
178  }
179  if( input[ char_cnt - 2 ] == base64_pad_char_ ) {
180  ++padding;
181  }
182 
183  // Init and reserve space for result. We store the reserved size here, so that we can assert
184  // to not go beyond that (and cause unplanned reallocation). We cannot use decoded.capacity()
185  // for that, as this might allocate slightly differently (always more though).
186  T decoded;
187  size_t const char_res = (( char_cnt / 4 ) * 3 ) - padding;
188  decoded.reserve( char_res );
189 
190  // Hold decoded quanta
191  std::uint32_t temp = 0;
192 
193  // We are expecting ASCII encoding here!
194  static_assert( 0x41 == 'A' && 0x5A == 'Z', "Non-ASCII encoding detected. We expect ASCII." );
195  static_assert( 0x61 == 'a' && 0x7A == 'z', "Non-ASCII encoding detected. We expect ASCII." );
196  static_assert( 0x30 == '0' && 0x39 == '9', "Non-ASCII encoding detected. We expect ASCII." );
197  static_assert( 0x2B == '+' && 0x2F == '/', "Non-ASCII encoding detected. We expect ASCII." );
198 
199  // Process the input
200  auto it = input.begin();
201  while( it < input.end() ) {
202 
203  // Each set of 4 (non-new-line) chars makes up 3 bytes of data.
204  for( std::size_t i = 0; i < 4; ++i ) {
205 
206  // Skip new lines.
207  while( utils::is_space( *it )) {
208  ++it;
209  }
210 
211  // Process the current char
212  temp <<= 6;
213  if ( *it >= 0x41 && *it <= 0x5A ) {
214  temp |= *it - 0x41;
215  } else if( *it >= 0x61 && *it <= 0x7A ) {
216  temp |= *it - 0x47;
217  } else if( *it >= 0x30 && *it <= 0x39 ) {
218  temp |= *it + 0x04;
219  } else if( *it == 0x2B ) {
220  temp |= 0x3E;
221  } else if( *it == 0x2F ) {
222  temp |= 0x3F;
223  } else if( *it == base64_pad_char_ ) {
224  switch( input.end() - it ) {
225  case 1: {
226  //One pad character
227  assert( decoded.size() + 2 <= char_res );
228  assert( decoded.size() + 2 <= decoded.capacity() );
229  decoded.push_back(( temp >> 16 ) & 0x000000FF );
230  decoded.push_back(( temp >> 8 ) & 0x000000FF );
231  return decoded;
232  }
233  case 2: {
234  //Two pad characters
235  assert( decoded.size() + 1 <= char_res );
236  assert( decoded.size() + 1 <= decoded.capacity() );
237  decoded.push_back(( temp >> 10 ) & 0x000000FF );
238  return decoded;
239  }
240  default: {
241  throw std::runtime_error( "Invalid padding in base64 decoding" );
242  }
243  }
244  } else {
245  throw std::runtime_error( "Invalid character in base64 decoding" );
246  }
247 
248  ++it;
249  }
250 
251  assert( decoded.size() + 3 <= char_res );
252  assert( decoded.size() + 3 <= decoded.capacity() );
253  decoded.push_back(( temp >> 16 ) & 0x000000FF );
254  decoded.push_back(( temp >> 8 ) & 0x000000FF );
255  decoded.push_back(( temp ) & 0x000000FF );
256  }
257 
258  // If our initial reservation was correct, we have reached exactly capacity.
259  assert( decoded.size() == char_res );
260  return decoded;
261 }
262 
263 // =================================================================================================
264 // Base 64 Container Conversion
265 // =================================================================================================
266 
267 std::string base64_encode( std::vector<std::uint8_t> const& input, size_t line_length )
268 {
269  return base64_encode_( input, line_length );
270 }
271 
272 std::string base64_encode( std::string const& input, size_t line_length )
273 {
274  return base64_encode_( input, line_length );
275 }
276 
277 std::vector<std::uint8_t> base64_decode_uint8( std::string const& input )
278 {
279  using ContainerType = std::vector<std::uint8_t>;
280  return base64_decode_<ContainerType>( input );
281 }
282 
283 std::string base64_decode_string( std::string const& input )
284 {
285  using ContainerType = std::string;
286  return base64_decode_<ContainerType>( input );
287 }
288 
289 } // namespace utils
290 } // namespace genesis
genesis::utils::base64_encode_lookup_
static const char base64_encode_lookup_[]
Definition: base64.cpp:47
genesis::utils::base64_encode
std::string base64_encode(std::vector< std::uint8_t > const &input, size_t line_length)
Definition: base64.cpp:267
genesis::utils::base64_decode_
T base64_decode_(std::string const &input)
Definition: base64.cpp:152
genesis::utils::base64_pad_char_
static const char base64_pad_char_
Definition: base64.cpp:49
genesis::utils::is_space
constexpr bool is_space(char c) noexcept
Return whether a char is some form of white space charater, so either space, tab, new line,...
Definition: char.hpp:200
genesis::utils::base64_decode_uint8
std::vector< std::uint8_t > base64_decode_uint8(std::string const &input)
Definition: base64.cpp:277
genesis::utils::base64_decode_string
std::string base64_decode_string(std::string const &input)
Definition: base64.cpp:283
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::base64_encode_
std::string base64_encode_(T const &input, size_t line_length)
Definition: base64.cpp:52
char.hpp
base64.hpp