A toolkit for working with phylogenetic data.
v0.24.0
string.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <algorithm>
36 #include <cctype>
37 #include <cstdio>
38 #include <iomanip>
39 #include <iostream>
40 #include <sstream>
41 #include <stdexcept>
42 
43 #ifdef GENESIS_AVX
44  #include <immintrin.h>
45 #endif
46 
47 namespace genesis {
48 namespace utils {
49 
50 // =================================================================================================
51 // Compare
52 // =================================================================================================
53 
54 bool contains_ci( std::vector<std::string> const& haystack, std::string const& needle )
55 {
56  auto const l_needle = to_lower( needle );
57  for( auto const& val : haystack ) {
58  if( to_lower( val ) == l_needle ) {
59  return true;
60  }
61  }
62  return false;
63 }
64 
65 bool equals_ci( std::string const& lhs, std::string const& rhs)
66 {
67  const size_t sz = lhs.size();
68  if( rhs.size() != sz ) {
69  return false;
70  }
71  for( size_t i = 0; i < sz; ++i ) {
72  if( tolower( lhs[i] ) != tolower( rhs[i] ) ) {
73  return false;
74  }
75  }
76  return true;
77 }
78 
79 bool starts_with( std::string const & text, std::string const & start )
80 {
81  if (start.size() > text.size()) {
82  return false;
83  }
84  return std::equal( start.begin(), start.end(), text.begin() );
85 }
86 
87 bool ends_with( std::string const & text, std::string const & ending )
88 {
89  if (ending.size() > text.size()) {
90  return false;
91  }
92  return std::equal( ending.rbegin(), ending.rend(), text.rbegin() );
93 }
94 
95 // =================================================================================================
96 // Substrings
97 // =================================================================================================
98 
99 std::string head( std::string const& text, size_t lines )
100 {
101  // Not totally efficient, but works for now.
102  auto vec = split( text, "\n", false );
103  size_t remove = vec.size() > lines ? vec.size() - lines : 0;
104  vec.erase( vec.end() - remove, vec.end() );
105  return join( vec, "\n" );
106 }
107 
108 std::string tail( std::string const& text, size_t lines )
109 {
110  // Not totally efficient, but works for now.
111  auto vec = split( text, "\n", false );
112  size_t remove = vec.size() > lines ? vec.size() - lines : 0;
113  vec.erase( vec.begin(), vec.begin() + remove );
114  return join( vec, "\n" );
115 }
116 
117 // =================================================================================================
118 // Find and Count
119 // =================================================================================================
120 
121 size_t count_substring_occurrences( std::string const& str, std::string const& sub )
122 {
123  if (sub.length() == 0) {
124  return 0;
125  }
126 
127  size_t count = 0;
128  for(
129  size_t offset = str.find(sub);
130  offset != std::string::npos;
131  offset = str.find( sub, offset + 1 )
132  ) {
133  ++count;
134  }
135 
136  return count;
137 }
138 
142 static std::vector<std::string> split_ (
143  std::string const& string,
144  std::function<size_t ( std::string const&, size_t )> find_pos,
145  size_t advance_by,
146  const bool trim_empty
147 ) {
148  std::vector<std::string> result;
149 
150  size_t last_pos = 0;
151  while( true ) {
152  // Find first matching char.
153  size_t pos = find_pos( string, last_pos );
154 
155  // If not found, push back rest and stop.
156  if( pos == std::string::npos ) {
157  pos = string.length();
158 
159  if( pos != last_pos || !trim_empty ) {
160  result.push_back( std::string( string.data() + last_pos, pos - last_pos ));
161  }
162 
163  break;
164 
165  // If found, push back and continue.
166  } else {
167  if( pos != last_pos || !trim_empty ) {
168  result.push_back( std::string( string.data() + last_pos, pos - last_pos ));
169  }
170  }
171 
172  last_pos = pos + advance_by;
173  }
174 
175  return result;
176 }
177 
178 std::vector<std::string> split (
179  std::string const& str,
180  std::string const& delimiters,
181  const bool trim_empty
182 ) {
183  return split_(
184  str,
185  [&]( std::string const& str, size_t last_pos ){
186  return str.find_first_of( delimiters, last_pos );
187  },
188  1,
189  trim_empty
190  );
191 }
192 
193 std::vector<std::string> split (
194  std::string const& str,
195  std::function<bool(char)> delimiter_predicate,
196  const bool trim_empty
197 ) {
198  return split_(
199  str,
200  [&]( std::string const& str, size_t last_pos ){
201  // Find first matching char.
202  size_t pos = std::string::npos;
203  for( size_t i = last_pos; i < str.size(); ++i ) {
204  if( delimiter_predicate( str[i] ) ) {
205  pos = i;
206  break;
207  }
208  }
209  return pos;
210  },
211  1,
212  trim_empty
213  );
214 }
215 
216 std::vector<std::string> split_at (
217  std::string const& str,
218  std::string const& delimiter,
219  const bool trim_empty
220 ) {
221  return split_(
222  str,
223  [&]( std::string const& str, size_t last_pos ){
224  return str.find( delimiter, last_pos );
225  },
226  delimiter.size(),
227  trim_empty
228  );
229 }
230 
231 std::vector<size_t> split_range_list( std::string const& str )
232 {
233  std::vector<size_t> result;
234 
235  auto is_digits = []( std::string const& s ){
236  return trim( s ).find_first_not_of( "0123456789" ) == std::string::npos;
237  };
238 
239  auto get_number = []( std::string const& s ){
240  size_t n;
241  sscanf( trim( s ).c_str(), "%zu", &n );
242  return n;
243  };
244 
245  if( trim( str ).empty() ) {
246  return result;
247  }
248 
249  auto const lst = split( str, "," );
250  for( auto const& le : lst ) {
251  // if just digits, done. if not, split -, repeat.
252  if( is_digits( le ) ) {
253  result.push_back( get_number( le ));
254  } else {
255  auto const rng = split( le, "-" );
256  if( rng.size() != 2 || ! is_digits( rng[0] ) || ! is_digits( rng[1] ) ) {
257  throw std::runtime_error( "Invalid range list string." );
258  }
259  auto const b = get_number( rng[0] );
260  auto const e = get_number( rng[1] );
261  for( size_t i = b; i <= e; ++i ) {
262  result.push_back( i );
263  }
264  }
265  }
266 
267  std::sort( result.begin(), result.end() );
268  return result;
269 }
270 
271 // =================================================================================================
272 // Manipulate
273 // =================================================================================================
274 
275 std::string wrap(
276  std::string const& text,
277  size_t line_length
278 ) {
279  /*
280  Code is adapted from https://www.rosettacode.org/wiki/Word_wrap#C.2B.2B,
281  which is published under the GNU Free Documentation License 1.2,
282  see https://www.gnu.org/licenses/old-licenses/fdl-1.2.html
283  We fixed the handling of overly long words,
284  and added correct handling of new lines in the text.
285  It is totally inefficient, but we only need this function for small texts anyway,
286  so for now, this is good enough.
287  */
288 
289  std::ostringstream output;
290  auto const lines = split( text, "\n", false );
291  for( auto const& line : lines ) {
292  std::istringstream text_stream( line );
293  std::string word;
294 
295  if( text_stream >> word ) {
296  output << word;
297  long space_left = static_cast<long>( line_length ) - static_cast<long>( word.length() );
298  while( text_stream >> word ) {
299  if( space_left < static_cast<long>( word.length() + 1 )) {
300  output << "\n" << word;
301  space_left = line_length - word.length();
302  } else {
303  output << " " << word;
304  space_left -= word.length() + 1;
305  }
306  }
307  }
308  output << "\n";
309  }
310 
311  return output.str();
312 }
313 
314 std::string indent(
315  std::string const& text,
316  std::string const& indentation
317 ) {
318  auto ret = indentation + replace_all( text, "\n", "\n" + indentation );
319  return trim_right( ret, indentation );
320 }
321 
322 std::string replace_all (
323  std::string const& text, std::string const& search, std::string const& replace
324 ) {
325  std::string tmp = text;
326  for (size_t pos = 0; ; pos += replace.length()) {
327  pos = tmp.find(search, pos);
328 
329  if (pos == std::string::npos){
330  break;
331  }
332 
333  tmp.erase(pos, search.length());
334  tmp.insert(pos, replace);
335  }
336  return tmp;
337 }
338 
339 // inline version
340 /*
341 void replace_all(
342  std::string &s, const std::string &search, const std::string &replace
343 ) {
344  for (size_t pos = 0; ; pos += replace.length() ) {
345  pos = s.find(search, pos);
346 
347  if (pos == string::npos)
348  break;
349 
350  s.erase(pos, search.length());
351  s.insert(pos, replace);
352  }
353 }
354 */
355 
356 std::string replace_all_chars (
357  std::string const& text,
358  std::string const& search_chars,
359  char replace
360 ) {
361  auto result = text;
362  for( auto& c : result ) {
363  if( search_chars.find( c ) != std::string::npos ) {
364  c = replace;
365  }
366  }
367  return result;
368 }
369 
370 std::string trim_right (
371  std::string const& s,
372  std::string const& delimiters
373 ) {
374  auto const pos = s.find_last_not_of(delimiters);
375  if( std::string::npos == pos ) {
376  return "";
377  } else {
378  return s.substr( 0, pos + 1 );
379  }
380 }
381 
382 std::string trim_left (
383  std::string const& s,
384  std::string const& delimiters
385 ) {
386  auto const pos = s.find_first_not_of(delimiters);
387  if( std::string::npos == pos ) {
388  return "";
389  } else {
390  return s.substr(pos);
391  }
392 }
393 
394 std::string trim (
395  std::string const& s,
396  std::string const& delimiters
397 ) {
398  return trim_left(trim_right(s, delimiters), delimiters);
399 }
400 
401 // =================================================================================================
402 // Case Conversion
403 // =================================================================================================
404 
405 #ifdef GENESIS_AVX
406 
407 inline void toggle_case_ascii_inplace_avx_( std::string& str, char char_a, char char_z )
408 {
409  // We use AVX2 here, which uses 256bit = 32byte. Hence, we move through the string in strides
410  // of 32. Concidentally, the ASCII marker for "upper/lower case" also has the value 32 (0x20),
411  // which might lead to confusion when reading the following code. Be warned.
412 
413  // Fill val_32 with 32x 0x20=32
414  auto static const val_32 = _mm256_set1_epi8( 0x20 );
415 
416  // Fill mask_a with 32x 'a/A', mask_z with 32x 'z/Z'
417  auto const mask_a = _mm256_set1_epi8( char_a );
418  auto const mask_z = _mm256_set1_epi8( char_z );
419 
420  // Loop in increments of 32, which is the AVX vector size in bytes.
421  for( size_t i = 0; i < str.size() / 32 * 32; i += 32 ) {
422  auto reg = _mm256_loadu_si256( reinterpret_cast<__m256i*>( &str[i] ) );
423 
424  // mask_az contains 0x00 where the character is between 'a/A' and 'z/Z', 0xff otherwise.
425  auto mask_az = _mm256_or_si256( _mm256_cmpgt_epi8( mask_a, reg ), _mm256_cmpgt_epi8( reg, mask_z ) );
426 
427  // Toggle the upper/lower char bit (0x20), 1 means lower case, 0 means upper case.
428  reg = _mm256_xor_si256( _mm256_andnot_si256( mask_az, val_32 ), reg );
429 
430  _mm256_storeu_si256( reinterpret_cast<__m256i*>( &str[i] ), reg );
431  }
432 
433  // Convert the rest that remains by toggling the upper/lower case bit.
434  for( size_t i = str.size() / 32 * 32; i < str.size(); ++i ) {
435  if( char_a <= str[i] && str[i] <= char_z ){
436  str[i] ^= 0x20;
437  }
438  }
439 }
440 
441 #endif // GENESIS_AVX
442 
443 void to_lower_ascii_inplace( std::string& str )
444 {
445  #ifdef GENESIS_AVX
446 
447  // Toggle the ascii case bit for all values between the two mask boundaries.
448  toggle_case_ascii_inplace_avx_( str, 'A', 'Z' );
449 
450  #else // GENESIS_AVX
451 
452  // Naive implementation that might use compiler-generated vector intrinsics.
453  for( auto& c : str ){
454  c = to_lower(c);
455  }
456 
457  #endif // GENESIS_AVX
458 }
459 
460 std::string to_lower_ascii( std::string const& str )
461 {
462  auto res = str;
463  to_lower_ascii_inplace( res );
464  return res;
465 }
466 
467 void to_upper_ascii_inplace( std::string& str )
468 {
469  #ifdef GENESIS_AVX
470 
471  // Toggle the ascii case bit for all values between the two mask boundaries.
472  toggle_case_ascii_inplace_avx_( str, 'a', 'z' );
473 
474  #else // GENESIS_AVX
475 
476  // Naive implementation that might use compiler-generated vector intrinsics.
477  for( auto& c : str ){
478  c = to_upper(c);
479  }
480 
481  #endif // GENESIS_AVX
482 }
483 
484 std::string to_upper_ascii( std::string const& str )
485 {
486  auto res = str;
487  to_upper_ascii_inplace( res );
488  return res;
489 }
490 
491 // =================================================================================================
492 // Normalize
493 // =================================================================================================
494 
495 std::string escape( std::string const& text )
496 {
497  // This is slow-ish, because the string is iterated multiple times. Could be done faster.
498  std::string tmp;
499  tmp = replace_all( text, "\r", "\\r" );
500  tmp = replace_all( tmp, "\n", "\\n" );
501  tmp = replace_all( tmp, "\t", "\\t" );
502  tmp = replace_all( tmp, "\"", "\\\"" );
503  tmp = replace_all( tmp, "\\", "\\\\" );
504  return tmp;
505 }
506 
507 std::string deescape( std::string const& text )
508 {
509  // Prepare a string that might be a bit too big, but saves reallocation.
510  std::string tmp;
511  tmp.reserve( text.size() );
512 
513  // Copy from text to tmp string, while deescaping.
514  for( size_t i = 0; i < text.size(); ++i ) {
515  if( text[ i ] == '\\' ) {
516  if( i + 1 >= text.size() ){
517  break;
518  }
519 
520  tmp += deescape( text[ i + 1 ] );
521  ++i;
522  } else {
523  tmp += text[ i ];
524  }
525  }
526  return tmp;
527 }
528 
529 char deescape( char c )
530 {
531  switch( c ) {
532  case 'r' :
533  return '\r';
534 
535  case 'n' :
536  return '\n';
537 
538  case 't' :
539  return '\t';
540 
541  default :
542  return c;
543  }
544 }
545 
546 // =================================================================================================
547 // Output
548 // =================================================================================================
549 
550 std::string repeat( std::string const& word, size_t times )
551 {
552  // Init and avoid repeated reallocation.
553  std::string result;
554  result.reserve( times * word.length() );
555 
556  // Concat repeats.
557  for( size_t i = 0; i < times; ++i ) {
558  result += word;
559  }
560  return result;
561 }
562 
563 std::string to_string_leading_zeros( size_t value, size_t length )
564 {
565  std::stringstream ss;
566  ss << std::setw( length ) << std::setfill( '0' ) << value;
567  return ss.str();
568 }
569 
570 std::string to_string_precise( double const value, int const precision )
571 {
572  // Simple and straight forward.
573  std::ostringstream s;
574  s << std::fixed << std::setprecision( precision ) << value;
575  return s.str();
576 }
577 
578 std::string to_string_rounded( double const value, int const precision )
579 {
580  // Get fixed precision string.
581  std::ostringstream s;
582  s << std::fixed << std::setprecision( precision ) << value;
583  auto str = s.str();
584 
585  // Truncate trailing zeros, unless there are only zeros after the decimal point. Then, also
586  // delete the decimal point.
587  size_t offset = 1;
588  size_t const last_nonzero = str.find_last_not_of('0');
589  if( str[ last_nonzero ] == '.' ) {
590  offset = 0;
591  }
592  str.erase( last_nonzero + offset, std::string::npos );
593  return str;
594 }
595 
596 } // namespace utils
597 } // namespace genesis
void offset(Histogram &h, double value)
Definition: operations.cpp:47
std::string escape(std::string const &text)
Return a string where special chars are replaces by their escape sequence.
Definition: string.cpp:495
std::string deescape(std::string const &text)
Return a string where backslash-escaped characters are transformed into their respective string form...
Definition: string.cpp:507
std::string to_string_rounded(double const value, int const precision)
Return a string representation of the input value, using the provided precision value (determining it...
Definition: string.cpp:578
bool starts_with(std::string const &text, std::string const &start)
Return whether a string starts with another string.
Definition: string.cpp:79
std::string repeat(std::string const &word, size_t times)
Take a string and repeat it a given number of times.
Definition: string.cpp:550
void to_upper_ascii_inplace(std::string &str)
Turn the given string to all-uppercase, ASCII-only, inline.
Definition: string.cpp:467
std::vector< size_t > split_range_list(std::string const &str)
Split a string containing positive interger numbers into its parts and resolve ranges.
Definition: string.cpp:231
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
Definition: string.cpp:484
std::string join(T const &v, std::string const &delimiter=", ")
Return a string where the elements of a container v are joined using the string delimiter in between ...
Definition: string.hpp:392
std::string indent(std::string const &text, std::string const &indentation)
Indent each line of text with indentation and return the result.
Definition: string.cpp:314
std::string trim_left(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with right trimmed white spaces.
Definition: string.cpp:382
std::string replace_all(std::string const &text, std::string const &search, std::string const &replace)
Return a copy of a string, where all occurrences of a search string are replaced by a replace string...
Definition: string.cpp:322
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with left trimmed white spaces.
Definition: string.cpp:370
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:394
void to_lower_ascii_inplace(std::string &str)
Turn the given string to all-lowercase, ASCII-only.
Definition: string.cpp:443
std::string replace_all_chars(std::string const &text, std::string const &search_chars, char replace)
Replace all occurrences of the search_chars in text by the replace char.
Definition: string.cpp:356
std::string to_string_precise(double const value, int const precision)
Return a precise string representation of the input value, using the provided precision value (determ...
Definition: string.cpp:570
std::string to_string_leading_zeros(size_t value, size_t length)
Return a string representation of a size_t value with a fixed length, that is, by adding leading zero...
Definition: string.cpp:563
std::string wrap(std::string const &text, size_t line_length)
Wrap a text at a given line_length.
Definition: string.cpp:275
static std::vector< std::string > split_(std::string const &string, std::function< size_t(std::string const &, size_t)> find_pos, size_t advance_by, const bool trim_empty)
Local function that does the work for the split cuntions.
Definition: string.cpp:142
std::vector< std::string > split(std::string const &str, std::string const &delimiters, const bool trim_empty)
Spilt a string into parts, given a delimiters set of chars.
Definition: string.cpp:178
std::string head(std::string const &text, size_t lines)
Return the first lines of the text.
Definition: string.cpp:99
Provides some commonly used string utility functions.
bool equals_ci(std::string const &lhs, std::string const &rhs)
Compare two strings case insensitive.
Definition: string.cpp:65
bool ends_with(std::string const &text, std::string const &ending)
Return whether a string ends with another string.
Definition: string.cpp:87
std::string tail(std::string const &text, size_t lines)
Return the last lines of the text.
Definition: string.cpp:108
size_t count_substring_occurrences(std::string const &str, std::string const &sub)
Return the number of (possibly overlapping) occurrences of a substring in a string.
Definition: string.cpp:121
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:227
std::vector< std::string > split_at(std::string const &str, std::string const &delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter string.
Definition: string.cpp:216
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:218
bool equal(Tree const &lhs, Tree const &rhs, std::function< bool(TreeNode const &, TreeNode const &) > node_comparator, std::function< bool(TreeEdge const &, TreeEdge const &) > edge_comparator)
Compare two trees for equality given binary comparator functionals for their nodes and edges...
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
Definition: string.cpp:460
bool contains_ci(std::vector< std::string > const &haystack, std::string const &needle)
Return whether a vector of strings contains a given string, case insensitive.
Definition: string.cpp:54