A library for working with phylogenetic and population genetic data.
v0.27.0
base64.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 
38 namespace genesis {
39 namespace utils {
40 
41 // =================================================================================================
42 // Base 64 Encode/Decode
43 // =================================================================================================
44 
45 // Code adaptem from https://en.wikibooks.org/wiki/Algorithm_Implementation/Miscellaneous/Base64#C++
46 
47 static const char base64_encode_lookup_[] =
48  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
49 static const char base64_pad_char_ = '=';
50 
51 template<class T>
52 std::string base64_encode_( T const& input, size_t line_length )
53 {
54  if( input.empty() ) {
55  return {};
56  }
57 
58  // Init and reserve space for result. We need both the actual length of the content,
59  // as well as the size for reserving, which might include room for new line chars.
60  std::string encoded;
61  size_t char_len = (( input.size() / 3 ) + ( input.size() % 3 > 0 )) * 4;
62  size_t char_res = char_len;
63  if( line_length > 0 ) {
64  // Reserve extra for new line chars, minus the trailing one if the division is exact.
65  char_res += char_res / line_length - ( char_res % line_length == 0 );
66  }
67  encoded.reserve( char_res );
68 
69  // We use a lambda to simplify putting chars, counting them, and wrapping lines as neccessary.
70  size_t out_cnt = 0;
71  auto put_char = [&]( char c ){
72  // Put the char
73  assert( encoded.size() + 1 <= char_res );
74  assert( encoded.size() + 1 <= encoded.capacity() );
75  encoded.append( 1, c );
76 
77  // Line wrapping as needed: Every time the modulo fires, expect for the beginning, and
78  // if the total length is an exact multiple, in which case we do not add a trailing new line.
79  ++out_cnt;
80  if( line_length > 0 && out_cnt % line_length == 0 && out_cnt < char_len ) {
81  assert( encoded.size() + 1 <= char_res );
82  assert( encoded.size() + 1 <= encoded.capacity() );
83  encoded.append( 1, '\n' );
84  }
85  };
86 
87  // Process main part of the input
88  std::uint32_t temp = 0;
89  auto it = input.begin();
90  for( std::size_t i = 0; i < input.size() / 3; ++i ) {
91  // Convert to big endian. We here also need to cast from whatever class T we have here
92  // (might be std::string, that is, char) to an unsigned char first, so that the actual
93  // byte value works, and then to the temp type, so that we can do the shifting properly.
94  // We cannot immediately cast to uint32_t here, as that has different module/casting
95  // behaviour that char to unsigned char casting. Hope that I got this right...
96  temp = static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 16;
97  temp += static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 8;
98  temp += static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ ));
99 
100  // Add to output
101  put_char( base64_encode_lookup_[( temp & 0x00FC0000 ) >> 18 ]);
102  put_char( base64_encode_lookup_[( temp & 0x0003F000 ) >> 12 ]);
103  put_char( base64_encode_lookup_[( temp & 0x00000FC0 ) >> 6 ]);
104  put_char( base64_encode_lookup_[( temp & 0x0000003F ) ]);
105  }
106 
107  // Process remaining part and add padding
108  switch( input.size() % 3 ) {
109  case 2: {
110  // Convert to big endian. See above for an explanation of the double cast.
111  temp = static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 16;
112  temp += static_cast<std::uint32_t>( static_cast<unsigned char>( *it++ )) << 8;
113 
114  // Add to output
115  put_char( base64_encode_lookup_[( temp & 0x00FC0000 ) >> 18 ]);
116  put_char( base64_encode_lookup_[( temp & 0x0003F000 ) >> 12 ]);
117  put_char( base64_encode_lookup_[( temp & 0x00000FC0 ) >> 6 ]);
118  put_char( base64_pad_char_);
119  break;
120  }
121  case 1: {
122  // Convert to big endian
123  temp = (*it++) << 16;
124 
125  // Add to output
126  put_char( base64_encode_lookup_[( temp & 0x00FC0000 ) >> 18 ]);
127  put_char( base64_encode_lookup_[( temp & 0x0003F000 ) >> 12 ]);
128  put_char( base64_pad_char_);
129  put_char( base64_pad_char_);
130  break;
131  }
132  }
133 
134  // If our initial reservation was correct, we have reached exactly capacity;
135  // however, we cannot reliably compare against the actual capacity here, as the stupid
136  // Mac OS implementation seems to not honor the reserve() properly and hence has a slightly
137  // different (bigger) capacity, so instead we compare against our intended capacity instead.
138  assert( encoded.size() == char_res );
139  return encoded;
140 }
141 
142 template<class T>
143 T base64_decode_( std::string const& input )
144 {
145  // Edge case.
146  if( input.size() == 0 ) {
147  return T{};
148  }
149 
150  // Here, we are lazy (for now) and count the number of actual (non-new-line) chars directly.
151  // This is fast enough for now. If this ever becomes a bottleneck, we shoudl refactor and
152  // only loop once instead. On the other hand, that would prohibit properly reserving the
153  // needed output space... Tradeoffs. But does not matter too much now. We just count first.
154  size_t char_cnt = 0;
155  for( auto c : input ) {
156  // Use a non-branching counter to be as fast as possible.
157  static_assert( (int)true == 1 && (int)false == 0, "Boolean counting does not work." );
158  char_cnt += ! utils::is_space( c );
159  }
160  if( char_cnt % 4 ) {
161  throw std::runtime_error( "Invalid base64 length that is not a multiple of 4");
162  }
163 
164  // Get padding
165  std::size_t padding = 0;
166  assert( char_cnt >= 4 );
167  if( input[ char_cnt - 1 ] == base64_pad_char_ ) {
168  ++padding;
169  }
170  if( input[ char_cnt - 2 ] == base64_pad_char_ ) {
171  ++padding;
172  }
173 
174  // Init and reserve space for result. We store the reserved size here, so that we can assert
175  // to not go beyond that (and cause unplanned reallocation). We cannot use decoded.capacity()
176  // for that, as this might allocate slightly differently (always more though).
177  T decoded;
178  size_t const char_res = (( char_cnt / 4 ) * 3 ) - padding;
179  decoded.reserve( char_res );
180 
181  // Hold decoded quanta
182  std::uint32_t temp = 0;
183 
184  // We are expecting ASCII encoding here!
185  static_assert( 0x41 == 'A' && 0x5A == 'Z', "Non-ASCII encoding detected. We expect ASCII." );
186  static_assert( 0x61 == 'a' && 0x7A == 'z', "Non-ASCII encoding detected. We expect ASCII." );
187  static_assert( 0x30 == '0' && 0x39 == '9', "Non-ASCII encoding detected. We expect ASCII." );
188  static_assert( 0x2B == '+' && 0x2F == '/', "Non-ASCII encoding detected. We expect ASCII." );
189 
190  // Process the input
191  auto it = input.begin();
192  while( it < input.end() ) {
193 
194  // Each set of 4 (non-new-line) chars makes up 3 bytes of data.
195  for( std::size_t i = 0; i < 4; ++i ) {
196 
197  // Skip new lines.
198  while( utils::is_space( *it )) {
199  ++it;
200  }
201 
202  // Process the current char
203  temp <<= 6;
204  if ( *it >= 0x41 && *it <= 0x5A ) {
205  temp |= *it - 0x41;
206  } else if( *it >= 0x61 && *it <= 0x7A ) {
207  temp |= *it - 0x47;
208  } else if( *it >= 0x30 && *it <= 0x39 ) {
209  temp |= *it + 0x04;
210  } else if( *it == 0x2B ) {
211  temp |= 0x3E;
212  } else if( *it == 0x2F ) {
213  temp |= 0x3F;
214  } else if( *it == base64_pad_char_ ) {
215  switch( input.end() - it ) {
216  case 1: {
217  //One pad character
218  assert( decoded.size() + 2 <= char_res );
219  assert( decoded.size() + 2 <= decoded.capacity() );
220  decoded.push_back(( temp >> 16 ) & 0x000000FF );
221  decoded.push_back(( temp >> 8 ) & 0x000000FF );
222  return decoded;
223  }
224  case 2: {
225  //Two pad characters
226  assert( decoded.size() + 1 <= char_res );
227  assert( decoded.size() + 1 <= decoded.capacity() );
228  decoded.push_back(( temp >> 10 ) & 0x000000FF );
229  return decoded;
230  }
231  default: {
232  throw std::runtime_error( "Invalid padding in base64 decoding" );
233  }
234  }
235  } else {
236  throw std::runtime_error( "Invalid character in base64 decoding" );
237  }
238 
239  ++it;
240  }
241 
242  assert( decoded.size() + 3 <= char_res );
243  assert( decoded.size() + 3 <= decoded.capacity() );
244  decoded.push_back(( temp >> 16 ) & 0x000000FF );
245  decoded.push_back(( temp >> 8 ) & 0x000000FF );
246  decoded.push_back(( temp ) & 0x000000FF );
247  }
248 
249  // If our initial reservation was correct, we have reached exactly capacity.
250  assert( decoded.size() == char_res );
251  return decoded;
252 }
253 
254 // =================================================================================================
255 // Base 64 Container Conversion
256 // =================================================================================================
257 
258 std::string base64_encode( std::vector<std::uint8_t> const& input, size_t line_length )
259 {
260  return base64_encode_( input, line_length );
261 }
262 
263 std::string base64_encode( std::string const& input, size_t line_length )
264 {
265  return base64_encode_( input, line_length );
266 }
267 
268 std::vector<std::uint8_t> base64_decode_uint8( std::string const& input )
269 {
270  using ContainerType = std::vector<std::uint8_t>;
271  return base64_decode_<ContainerType>( input );
272 }
273 
274 std::string base64_decode_string( std::string const& input )
275 {
276  using ContainerType = std::string;
277  return base64_decode_<ContainerType>( input );
278 }
279 
280 } // namespace utils
281 } // namespace genesis
genesis::utils::base64_encode_lookup_
static const char base64_encode_lookup_[]
Definition: base64.cpp:47
genesis::utils::base64_encode
std::string base64_encode(std::vector< std::uint8_t > const &input, size_t line_length)
Definition: base64.cpp:258
genesis::utils::base64_decode_
T base64_decode_(std::string const &input)
Definition: base64.cpp:143
genesis::utils::base64_pad_char_
static const char base64_pad_char_
Definition: base64.cpp:49
genesis::utils::is_space
constexpr bool is_space(char c) noexcept
Return whether a char is some form of white space charater, so either space, tab, new line,...
Definition: char.hpp:200
genesis::utils::base64_decode_uint8
std::vector< std::uint8_t > base64_decode_uint8(std::string const &input)
Definition: base64.cpp:268
genesis::utils::base64_decode_string
std::string base64_decode_string(std::string const &input)
Definition: base64.cpp:274
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::base64_encode_
std::string base64_encode_(T const &input, size_t line_length)
Definition: base64.cpp:52
char.hpp
base64.hpp