A library for working with phylogenetic and population genetic data.
v0.32.0
string.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <cctype>
39 #include <cstdio>
40 #include <iomanip>
41 #include <iostream>
42 #include <limits>
43 #include <sstream>
44 #include <stdexcept>
45 
46 #if defined(GENESIS_AVX) || defined(GENESIS_AVX2) || defined(GENESIS_AVX512)
47 
48  #include <immintrin.h>
49 
50 #endif
51 
52 namespace genesis {
53 namespace utils {
54 
55 // =================================================================================================
56 // Compare and Find
57 // =================================================================================================
58 
59 bool contains_ci( std::vector<std::string> const& haystack, std::string const& needle )
60 {
61  // Lazy. Could be more optimized for sure, if this ever becomes a bottleneck.
62  auto const l_needle = to_lower( needle );
63  for( auto const& val : haystack ) {
64  if( to_lower( val ) == l_needle ) {
65  return true;
66  }
67  }
68  return false;
69 }
70 
71 bool contains_ci_alnum( std::vector<std::string> const& haystack, std::string const& needle )
72 {
73  // Lazy. Could be more optimized for sure, if this ever becomes a bottleneck.
74  auto const lower_alnum_needle = to_lower( remove_all_non_alnum( needle ));
75  for( auto const& val : haystack ) {
76  auto alnum_val = remove_all_non_alnum( val );
77  if( equals_ci( alnum_val, lower_alnum_needle )) {
78  return true;
79  }
80  }
81  return false;
82 }
83 
84 int strcasecmp( char const* s1, char const* s2 )
85 {
86  // Avoid code duplication at minimal runtime cost.
87  return strncasecmp( s1, s2, std::numeric_limits<size_t>::max() );
88 }
89 
90 int strncasecmp( char const* s1, char const* s2, size_t n )
91 {
92  // Fast edge case
93  if( s1 == s2 ) {
94  return 0;
95  }
96 
97  // Need to convert to unsigned, https://stackoverflow.com/a/51992138
98  auto ucs1 = reinterpret_cast<unsigned char const*>( s1 );
99  auto ucs2 = reinterpret_cast<unsigned char const*>( s2 );
100 
101  // Run the comparison
102  int d = 0;
103  for( ; n != 0; n-- ) {
104  int const c1 = tolower( *ucs1++ );
105  int const c2 = tolower( *ucs2++ );
106  d = c1 - c2;
107  if(( d != 0 ) || ( c1 == '\0' )) {
108  break;
109  }
110  }
111  return d;
112 }
113 
114 bool equals_ci( std::string const& lhs, std::string const& rhs)
115 {
116  const size_t sz = lhs.size();
117  if( rhs.size() != sz ) {
118  return false;
119  }
120  for( size_t i = 0; i < sz; ++i ) {
121  if( tolower( lhs[i] ) != tolower( rhs[i] ) ) {
122  return false;
123  }
124  }
125  return true;
126 }
127 
128 bool equals_ci_alnum( std::string const& lhs, std::string const& rhs )
129 {
130  // Lazy. Could be more optimized for sure, if this ever becomes a bottleneck.
131  auto const alnum_lhs = remove_all_non_alnum( lhs );
132  auto const alnum_rhs = remove_all_non_alnum( rhs );
133  return equals_ci( alnum_lhs, alnum_rhs );
134 }
135 
136 bool starts_with( std::string const& text, std::string const& prefix )
137 {
138  if( prefix.size() > text.size() ) {
139  return false;
140  }
141  return std::equal( prefix.begin(), prefix.end(), text.begin() );
142 }
143 
144 bool starts_with( std::string const& text, std::string const& prefix, std::string& suffix )
145 {
146  auto const res = starts_with( text, prefix );
147  if( res ) {
148  assert( prefix.size() <= text.size() );
149  suffix = text.substr( prefix.size() );
150  }
151  return res;
152 }
153 
154 bool starts_with_ci( std::string const& text, std::string const& prefix )
155 {
156  // Lazy. Could be more optimized for sure, if this ever becomes a bottleneck.
157  return starts_with( to_lower( text ), to_lower( prefix ));
158 }
159 
160 bool starts_with_ci( std::string const& text, std::string const& prefix, std::string& suffix )
161 {
162  auto const res = starts_with_ci( text, prefix );
163  if( res ) {
164  assert( prefix.size() <= text.size() );
165  suffix = text.substr( prefix.size() );
166  }
167  return res;
168 }
169 
170 bool starts_with_ci_alnum( std::string const& text, std::string const& prefix )
171 {
172  // Ignore the suffix
173  std::string suffix;
174  return starts_with_ci_alnum( text, prefix, suffix );
175 }
176 
178  std::string const& text, std::string const& prefix, std::string& suffix, bool trim_suffix
179 ) {
180  // Pattern matching of trying to find the prefix in the text,
181  // ignorning all non-alnum characters, and case-insensitive.
182  // For example, we want prefix == "refcnt" to match text == "REF_CNT.S2-1.sorted:1",
183  // and then return the remainder "S2-1.sorted:1" as our suffix.
184  size_t p = 0;
185  size_t t = 0;
186  while( p < prefix.size() && t < text.size() ) {
187  // assert( is_graph(prefix[p]) );
188  // assert( is_graph(text[t]) );
189 
190  if( ! is_alnum(prefix[p]) ) {
191  ++p;
192  continue;
193  }
194  if( ! is_alnum(text[t]) ) {
195  ++t;
196  continue;
197  }
198  if( char_match_ci( prefix[p], text[t] )) {
199  ++p;
200  ++t;
201  } else {
202  return false;
203  }
204  }
205  assert( p <= prefix.size() );
206  assert( t <= text.size() );
207 
208  // If our pattern matching reached the end of the text,
209  // there is nothing left in the text that we could use as a suffix.
210  if( t == text.size() ) {
211  suffix = "";
212  return true;
213  }
214 
215  // However, if we are not yet at the end of the text, we must have reached
216  // the end of the search pattern (prefix), as otherwise the loop would not have termined.
217  assert( t < text.size() );
218  assert( p == prefix.size() );
219 
220  // Now we skip all non-alnum chars in the text from where we are, if requested.
221  while( trim_suffix && t < text.size() && ! is_alnum(text[t]) ) {
222  ++t;
223  }
224 
225  // If there is stuff left, that is the suffix that we use.
226  suffix = text.substr(t);
227  return true;
228 }
229 
230 bool ends_with( std::string const& text, std::string const& suffix )
231 {
232  if( suffix.size() > text.size() ) {
233  return false;
234  }
235  return std::equal( suffix.rbegin(), suffix.rend(), text.rbegin() );
236 }
237 
238 bool ends_with( std::string const& text, std::string const& suffix, std::string& prefix )
239 {
240  auto const res = ends_with( text, suffix );
241  if( res ) {
242  assert( suffix.size() <= text.size() );
243  prefix = text.substr( 0, text.size() - suffix.size() );
244  }
245  return res;
246 }
247 
248 bool ends_with_ci( std::string const& text, std::string const& suffix )
249 {
250  // Lazy. Could be more optimized for sure, if this ever becomes a bottleneck.
251  return ends_with( to_lower( text ), to_lower( suffix ));
252 }
253 
254 bool ends_with_ci( std::string const& text, std::string const& suffix, std::string& prefix )
255 {
256  auto const res = ends_with_ci( text, suffix );
257  if( res ) {
258  assert( suffix.size() <= text.size() );
259  prefix = text.substr( 0, text.size() - suffix.size() );
260  }
261  return res;
262 }
263 
264 bool ends_with_ci_alnum( std::string const& text, std::string const& suffix )
265 {
266  // Ignore the prefix
267  std::string prefix;
268  return ends_with_ci_alnum( text, suffix, prefix );
269 }
270 
272  std::string const& text, std::string const& suffix, std::string& prefix, bool trim_prefix
273 ) {
274  // Lazy. Could be more optimized for sure, if this ever becomes a bottleneck.
275  // Find a prefix, by reversing all strings.
276  auto const text_r = std::string( text.rbegin(), text.rend() );
277  auto const suffix_r = std::string( suffix.rbegin(), suffix.rend() );
278  if( starts_with_ci_alnum( text_r, suffix_r, prefix, trim_prefix )) {
279  // If this succeeded, reverse the result again, to make it correct.
280  prefix = std::string( prefix.rbegin(), prefix.rend() );
281  return true;
282  }
283  return false;
284 }
285 
286 bool match_wildcards( std::string const& str, std::string const& pattern )
287 {
288  // Code adapted from https://www.geeksforgeeks.org/wildcard-pattern-matching/
289 
290  // The empty pattern can only match with the empty string
291  if( pattern.empty() ) {
292  return str.empty();
293  }
294 
295  // Lookup table for dynamic programming approach of subproblem solutions, and init to zero.
296  // We use a vec of bool, and a lambda for access as if it was a matrix.
297  auto lookup_ = std::vector<bool>(( str.size() + 1 ) * ( pattern.size() + 1 ), false);
298  auto lookup = [&]( size_t i, size_t j ) -> std::vector<bool>::reference {
299  return lookup_[ i * ( pattern.size() + 1 ) + j ];
300  };
301 
302  // The empty pattern can match with empty string
303  lookup( 0, 0 ) = true;
304 
305  // Only '*' can match with empty string
306  for( size_t j = 1; j <= pattern.size(); j++ ) {
307  if( pattern[j - 1] == '*' ) {
308  lookup( 0, j ) = lookup( 0, j - 1 );
309  }
310  }
311 
312  // Fill the table in bottom-up fashion
313  for( size_t i = 1; i <= str.size(); i++ ) {
314  for( size_t j = 1; j <= pattern.size(); j++ ) {
315  if( pattern[j - 1] == '*' ) {
316 
317  // Two cases if we see a '*':
318  // a) We ignore ‘*’ character and move to next character in the pattern,
319  // i.e., ‘*’ indicates an empty sequence.
320  // b) '*' character matches with ith character in input
321  lookup( i, j ) = lookup( i, j - 1 ) || lookup( i - 1, j );
322 
323  } else if( pattern[j - 1] == '?' || str[i - 1] == pattern[j - 1] )
324 
325  // Current characters are considered as matching in two cases:
326  // (a) current character of pattern is '?'
327  // (b) characters actually match
328  lookup( i, j ) = lookup( i - 1, j - 1 );
329 
330  else {
331 
332  // If characters don't match
333  lookup( i, j ) = false;
334  }
335  }
336  }
337 
338  return lookup( str.size(), pattern.size() );
339 }
340 
341 int compare_natural( std::string const& lhs, std::string const& rhs )
342 {
343  // Implementation inspired by http://www.davekoelle.com/files/alphanum.hpp
344  // Released under the MIT License - https://opensource.org/licenses/MIT
345  // We however heavily modified it, in particular to work with arbitrary runs of digits.
346 
347  // Edge cases of empty strings.
348  if( lhs.empty() || rhs.empty() ) {
349  // Smart! Let's avoid to do all three cases, and instead convert to int (0 or 1):
350  // * lhs empty, but rhs not: 0 - 1 = -1
351  // * rhs empty, but lhs not: 1 - 0 = +1
352  // * both empty: 1 - 1 = 0
353  return static_cast<int>( rhs.empty() ) - static_cast<int>( lhs.empty() );
354  }
355 
356  // We need to switch between modes. Clear semantics instead of just a bool.
357  enum class ParseMode
358  {
359  kString,
360  kNumber
361  };
362  auto mode = ParseMode::kString;
363 
364  // Helper function to parse a string into an unsigned number, quickly.
365  // Advances the given pos while parsing, until either the end of the string or no more digits.
366  // --> not used, as it can only handle numbers up to the size of unsigned long...
367  // auto parse_unsigned_number_ = []( std::string const& str, size_t& pos ){
368  // unsigned long num = 0;
369  // while( pos < str.size() && is_digit( str[pos] )) {
370  // auto dig = str[pos] - '0';
371  // if( num > ( std::numeric_limits<T>::max() - dig ) / 10 ) {
372  // // This is ugly, and a proper solution would be to take string lengths into
373  // // account, but that would probably require to fully load them, and then compare...
374  // throw std::overflow_error( "Numerical overflow in compare_natural()" );
375  // }
376  // num = 10 * num + dig;
377  // ++pos;
378  // }
379  // return num;
380  // };
381 
382  // Iterate positions in the strings.
383  size_t l = 0;
384  size_t r = 0;
385  while( l < lhs.size() && r < rhs.size() ) {
386  if( mode == ParseMode::kString ) {
387 
388  // Iterate as long as there are strings/chars in both.
389  while( l < lhs.size() && r < rhs.size() ) {
390 
391  // Check if these are digits.
392  bool const l_digit = is_digit( lhs[l] );
393  bool const r_digit = is_digit( rhs[r] );
394 
395  // If both are digits, we continue in number mode.
396  if( l_digit && r_digit ) {
397  mode = ParseMode::kNumber;
398  break;
399  }
400 
401  // If only one of them is a digit, we have a result.
402  if( l_digit ) {
403  return -1;
404  }
405  if( r_digit ) {
406  return +1;
407  }
408 
409  // Neither is a digit, so compare as ASCII chars; if they differ, we have a result.
410  assert( ! l_digit && ! r_digit );
411  int const diff = static_cast<int>( lhs[l] ) - static_cast<int>( rhs[r] );
412  if( diff != 0 ) {
413  return diff;
414  }
415 
416  // Otherwise, process the next character.
417  ++l;
418  ++r;
419  }
420 
421  } else {
422  assert( mode == ParseMode::kNumber );
423 
424  // Here, a first idea was to parse both strings as numbers for as long as they contain
425  // digits, and then compare the resulting numbers. However, this overflows for larger
426  // numbers, and we can easily avoid that by an equally simple solution. We might need
427  // to iterate the digits twice, but save the effort of actually building the numbers!
428  // (see above parse_unsigned_number_() for the parsing function that we first had)
429 
430  // Parse the strings as long as they contain digits, advancing helper indices here.
431  size_t ld = l;
432  size_t rd = r;
433  while( ld < lhs.size() && is_digit( lhs[ld] )) {
434  ++ld;
435  }
436  while( rd < rhs.size() && is_digit( rhs[rd] )) {
437  ++rd;
438  }
439 
440  // If the lengths of digit runs differ, one of them is a larger number than the other.
441  // In that case, we have a result.
442  if( ld != rd ) {
443  return static_cast<int>( ld ) - static_cast<int>( rd );
444  }
445 
446  // If those numbers are the same length, we need to iterate again,
447  // and check digit by digit. Iterate as long as there are digits in both.
448  while( l < lhs.size() && r < rhs.size() ) {
449 
450  // Check if these are digits.
451  bool const l_digit = is_digit( lhs[l] );
452  bool const r_digit = is_digit( rhs[r] );
453 
454  // If there are no more digits, we continue in string mode.
455  if( ! l_digit || ! r_digit ) {
456  // In that case, both have to be not digits, as we just checked same length
457  // of the digit run, and both have to be the same as our previous iteration.
458  assert( ! l_digit && ! r_digit );
459  assert( ld == rd && l == ld && r == rd );
460  mode = ParseMode::kString;
461  break;
462  }
463 
464  // Compare the digits as ASCII chars; if they differ, we have a result.
465  assert( l_digit && r_digit );
466  int const diff = static_cast<int>( lhs[l] ) - static_cast<int>( rhs[r] );
467  if( diff != 0 ) {
468  return diff;
469  }
470 
471  // Otherwise, process the next character.
472  ++l;
473  ++r;
474  }
475  }
476  }
477 
478  // Lastly, if we are here, both strings are identical up to the point to which the were compared.
479  // So now, remaining lenghts checks. Only if everything is identical, return 0.
480  if( l < lhs.size() ) {
481  assert( r == rhs.size() );
482  return +1;
483  }
484  if( r < rhs.size() ) {
485  assert( l == lhs.size() );
486  return -1;
487  }
488  assert( l == lhs.size() && r == rhs.size() );
489  return 0;
490 }
491 
492 // =================================================================================================
493 // Substrings
494 // =================================================================================================
495 
496 std::string head( std::string const& text, size_t lines )
497 {
498  // Not totally efficient, but works for now.
499  auto vec = split( text, "\n", false );
500  size_t remove = vec.size() > lines ? vec.size() - lines : 0;
501  vec.erase( vec.end() - remove, vec.end() );
502  return join( vec, "\n" );
503 }
504 
505 std::string tail( std::string const& text, size_t lines )
506 {
507  // Not totally efficient, but works for now.
508  auto vec = split( text, "\n", false );
509  size_t remove = vec.size() > lines ? vec.size() - lines : 0;
510  vec.erase( vec.begin(), vec.begin() + remove );
511  return join( vec, "\n" );
512 }
513 
514 // =================================================================================================
515 // Split and Count
516 // =================================================================================================
517 
518 size_t count_substring_occurrences( std::string const& str, std::string const& sub )
519 {
520  if (sub.length() == 0) {
521  return 0;
522  }
523 
524  size_t count = 0;
525  for(
526  size_t offset = str.find(sub);
527  offset != std::string::npos;
528  offset = str.find( sub, offset + 1 )
529  ) {
530  ++count;
531  }
532 
533  return count;
534 }
535 
539 static std::vector<std::string> split_ (
540  std::string const& string,
541  std::function<size_t ( std::string const&, size_t )> find_pos,
542  size_t advance_by,
543  const bool trim_empty
544 ) {
545  std::vector<std::string> result;
546 
547  size_t last_pos = 0;
548  while( true ) {
549  // Find first matching char.
550  size_t pos = find_pos( string, last_pos );
551 
552  // If not found, push back rest and stop.
553  if( pos == std::string::npos ) {
554  pos = string.length();
555 
556  if( pos != last_pos || !trim_empty ) {
557  result.push_back( std::string( string.data() + last_pos, pos - last_pos ));
558  }
559 
560  break;
561 
562  // If found, push back and continue.
563  } else {
564  if( pos != last_pos || !trim_empty ) {
565  result.push_back( std::string( string.data() + last_pos, pos - last_pos ));
566  }
567  }
568 
569  last_pos = pos + advance_by;
570  }
571 
572  return result;
573 }
574 
575 std::vector<std::string> split (
576  std::string const& str,
577  char delimiter,
578  const bool trim_empty
579 ) {
580  return split( str, std::string( 1, delimiter ), trim_empty );
581 }
582 
583 std::vector<std::string> split (
584  std::string const& str,
585  std::string const& delimiters,
586  const bool trim_empty
587 ) {
588  return split_(
589  str,
590  [&]( std::string const& str, size_t last_pos ){
591  return str.find_first_of( delimiters, last_pos );
592  },
593  1,
594  trim_empty
595  );
596 }
597 
598 std::vector<std::string> split (
599  std::string const& str,
600  std::function<bool(char)> delimiter_predicate,
601  const bool trim_empty
602 ) {
603  return split_(
604  str,
605  [&]( std::string const& str, size_t last_pos ){
606  // Find first matching char.
607  size_t pos = std::string::npos;
608  for( size_t i = last_pos; i < str.size(); ++i ) {
609  if( delimiter_predicate( str[i] ) ) {
610  pos = i;
611  break;
612  }
613  }
614  return pos;
615  },
616  1,
617  trim_empty
618  );
619 }
620 
621 std::vector<std::string> split_at (
622  std::string const& str,
623  std::string const& delimiter,
624  const bool trim_empty
625 ) {
626  return split_(
627  str,
628  [&]( std::string const& str, size_t last_pos ){
629  return str.find( delimiter, last_pos );
630  },
631  delimiter.size(),
632  trim_empty
633  );
634 }
635 
636 std::vector<size_t> split_range_list( std::string const& str )
637 {
638  std::vector<size_t> result;
639 
640  auto is_digits = []( std::string const& s ){
641  return trim( s ).find_first_not_of( "0123456789" ) == std::string::npos;
642  };
643 
644  auto get_number = []( std::string const& s ){
645  size_t n;
646  sscanf( trim( s ).c_str(), "%zu", &n );
647  return n;
648  };
649 
650  if( trim( str ).empty() ) {
651  return result;
652  }
653 
654  auto const lst = split( str, "," );
655  for( auto const& le : lst ) {
656  // if just digits, done. if not, split -, repeat.
657  if( is_digits( le ) ) {
658  result.push_back( get_number( le ));
659  } else {
660  auto const rng = split( le, "-" );
661  if( rng.size() != 2 || ! is_digits( rng[0] ) || ! is_digits( rng[1] ) ) {
662  throw std::runtime_error( "Invalid range list string." );
663  }
664  auto const b = get_number( rng[0] );
665  auto const e = get_number( rng[1] );
666  for( size_t i = b; i <= e; ++i ) {
667  result.push_back( i );
668  }
669  }
670  }
671 
672  std::sort( result.begin(), result.end() );
673  return result;
674 }
675 
676 // =================================================================================================
677 // Manipulate
678 // =================================================================================================
679 
680 std::string wrap(
681  std::string const& text,
682  size_t line_length
683 ) {
684  /*
685  Code is adapted from https://www.rosettacode.org/wiki/Word_wrap#C.2B.2B,
686  which is published under the GNU Free Documentation License 1.2,
687  see https://www.gnu.org/licenses/old-licenses/fdl-1.2.html
688  We fixed the handling of overly long words,
689  and added correct handling of new lines in the text.
690  It is totally inefficient, but we only need this function for small texts anyway,
691  so for now, this is good enough.
692  */
693 
694  std::ostringstream output;
695  auto const lines = split( text, "\n", false );
696  for( auto const& line : lines ) {
697  std::istringstream text_stream( line );
698  std::string word;
699 
700  if( text_stream >> word ) {
701  output << word;
702  long space_left = static_cast<long>( line_length ) - static_cast<long>( word.length() );
703  while( text_stream >> word ) {
704  if( space_left < static_cast<long>( word.length() + 1 )) {
705  output << "\n" << word;
706  space_left = line_length - word.length();
707  } else {
708  output << " " << word;
709  space_left -= word.length() + 1;
710  }
711  }
712  }
713  output << "\n";
714  }
715 
716  return output.str();
717 }
718 
719 std::string indent(
720  std::string const& text,
721  std::string const& indentation
722 ) {
723  auto ret = indentation + replace_all( text, "\n", "\n" + indentation );
724  return trim_right( ret, indentation );
725 }
726 
727 std::string replace_all (
728  std::string const& text, std::string const& search, std::string const& replace
729 ) {
730  std::string tmp = text;
731  for (size_t pos = 0; ; pos += replace.length()) {
732  pos = tmp.find(search, pos);
733 
734  if (pos == std::string::npos){
735  break;
736  }
737 
738  tmp.erase(pos, search.length());
739  tmp.insert(pos, replace);
740  }
741  return tmp;
742 }
743 
744 // inline version
745 /*
746 void replace_all(
747  std::string &s, const std::string &search, const std::string &replace
748 ) {
749  for (size_t pos = 0; ; pos += replace.length() ) {
750  pos = s.find(search, pos);
751 
752  if (pos == string::npos)
753  break;
754 
755  s.erase(pos, search.length());
756  s.insert(pos, replace);
757  }
758 }
759 */
760 
761 std::string remove_all(
762  std::string const& text,
763  std::string const& search
764 ) {
765  return replace_all( text, search, "" );
766 }
767 
768 std::string replace_all_chars (
769  std::string const& text,
770  std::string const& search_chars,
771  char replace
772 ) {
773  auto result = text;
774  for( auto& c : result ) {
775  if( search_chars.find( c ) != std::string::npos ) {
776  c = replace;
777  }
778  }
779  return result;
780 }
781 
782 std::string remove_all_chars(
783  std::string const& text,
784  std::string const& search_chars
785 ) {
786  // We do multiple rounds... bit easier for now, and not performance critical.
787  auto result = text;
788  for( size_t i = 0; i < search_chars.length(); ++i ) {
789  result.erase( std::remove( result.begin(), result.end(), search_chars[i] ), result.end() );
790  }
791  return result;
792 }
793 
794 std::string remove_all_non_alnum( std::string const& text )
795 {
796  return remove_all_chars_pred(
797  text, []( char c ){
798  return ! is_alnum(c);
799  }
800  );
801 }
802 
803 std::string trim_right (
804  std::string const& s,
805  std::string const& delimiters
806 ) {
807  auto const pos = s.find_last_not_of(delimiters);
808  if( std::string::npos == pos ) {
809  return "";
810  } else {
811  return s.substr( 0, pos + 1 );
812  }
813 }
814 
815 std::string trim_left (
816  std::string const& s,
817  std::string const& delimiters
818 ) {
819  auto const pos = s.find_first_not_of(delimiters);
820  if( std::string::npos == pos ) {
821  return "";
822  } else {
823  return s.substr(pos);
824  }
825 }
826 
827 std::string trim (
828  std::string const& s,
829  std::string const& delimiters
830 ) {
831  return trim_left(trim_right(s, delimiters), delimiters);
832 }
833 
834 // =================================================================================================
835 // Case Conversion
836 // =================================================================================================
837 
838 #ifdef GENESIS_AVX2
839 
840 inline void toggle_case_ascii_inplace_avx_( std::string& str, char char_a, char char_z )
841 {
842  // We use AVX2 here, which uses 256bit = 32byte. Hence, we move through the string in strides
843  // of 32. Concidentally, the ASCII marker for "upper/lower case" also has the value 32 (0x20),
844  // which might lead to confusion when reading the following code. Be warned.
845 
846  // Fill val_32 with 32x 0x20=32
847  auto static const val_32 = _mm256_set1_epi8( 0x20 );
848 
849  // Fill mask_a with 32x 'a/A', mask_z with 32x 'z/Z'
850  auto const mask_a = _mm256_set1_epi8( char_a );
851  auto const mask_z = _mm256_set1_epi8( char_z );
852 
853  // Loop in increments of 32, which is the AVX vector size in bytes.
854  for( size_t i = 0; i < str.size() / 32 * 32; i += 32 ) {
855  auto reg = _mm256_loadu_si256( reinterpret_cast<__m256i*>( &str[i] ) );
856 
857  // mask_az contains 0x00 where the character is between 'a/A' and 'z/Z', 0xff otherwise.
858  auto mask_az = _mm256_or_si256(
859  _mm256_cmpgt_epi8( mask_a, reg ), _mm256_cmpgt_epi8( reg, mask_z )
860  );
861 
862  // Toggle the upper/lower char bit (0x20), 1 means lower case, 0 means upper case.
863  reg = _mm256_xor_si256( _mm256_andnot_si256( mask_az, val_32 ), reg );
864 
865  _mm256_storeu_si256( reinterpret_cast<__m256i*>( &str[i] ), reg );
866  }
867 
868  // Convert the rest that remains by toggling the upper/lower case bit.
869  for( size_t i = str.size() / 32 * 32; i < str.size(); ++i ) {
870  if( char_a <= str[i] && str[i] <= char_z ){
871  str[i] ^= 0x20;
872  }
873  }
874 }
875 
876 #endif // GENESIS_AVX2
877 
878 void to_lower_ascii_inplace( std::string& str )
879 {
880  #ifdef GENESIS_AVX2
881 
882  // Toggle the ascii case bit for all values between the two mask boundaries.
883  toggle_case_ascii_inplace_avx_( str, 'A', 'Z' );
884 
885  #else // GENESIS_AVX2
886 
887  // Naive implementation that might use compiler-generated vector intrinsics.
888  for( auto& c : str ){
889  c = to_lower(c);
890  }
891 
892  #endif // GENESIS_AVX2
893 }
894 
895 std::string to_lower_ascii( std::string const& str )
896 {
897  auto res = str;
898  to_lower_ascii_inplace( res );
899  return res;
900 }
901 
902 void to_upper_ascii_inplace( std::string& str )
903 {
904  #ifdef GENESIS_AVX2
905 
906  // Toggle the ascii case bit for all values between the two mask boundaries.
907  toggle_case_ascii_inplace_avx_( str, 'a', 'z' );
908 
909  #else // GENESIS_AVX2
910 
911  // Naive implementation that might use compiler-generated vector intrinsics.
912  for( auto& c : str ){
913  c = to_upper(c);
914  }
915 
916  #endif // GENESIS_AVX2
917 }
918 
919 std::string to_upper_ascii( std::string const& str )
920 {
921  auto res = str;
922  to_upper_ascii_inplace( res );
923  return res;
924 }
925 
926 // =================================================================================================
927 // Normalize
928 // =================================================================================================
929 
930 std::string escape( std::string const& text )
931 {
932  static_assert( CHAR_BIT == 8, "CHAR_BIT != 8" );
933  std::string tmp;
934  for( auto c : text ) {
935  if( ' ' <= c && c <= '~' && c != '\\' && c != '"') {
936  tmp += c;
937  } else {
938  tmp += '\\';
939  switch( c ) {
940  case '"': tmp += '"'; break;
941  case '\\': tmp += '\\'; break;
942  case '\t': tmp += 't'; break;
943  case '\r': tmp += 'r'; break;
944  case '\n': tmp += 'n'; break;
945  default: {
946  // Without any encoding, we simply opt for a hex representation of the value.
947  char const* const hexdig = "0123456789ABCDEF";
948  tmp += 'x';
949  tmp += hexdig[c >> 4];
950  tmp += hexdig[c & 0xF];
951  }
952  }
953  }
954  }
955  return tmp;
956 }
957 
958 std::string deescape( std::string const& text )
959 {
960  // Prepare a string that might be a bit too big, but saves reallocation.
961  std::string tmp;
962  tmp.reserve( text.size() );
963 
964  // Copy from text to tmp string, while deescaping.
965  for( size_t i = 0; i < text.size(); ++i ) {
966  if( text[ i ] == '\\' ) {
967  if( i + 1 >= text.size() ){
968  break;
969  }
970 
971  tmp += deescape( text[ i + 1 ] );
972  ++i;
973  } else {
974  tmp += text[ i ];
975  }
976  }
977  return tmp;
978 }
979 
980 char deescape( char c )
981 {
982  switch( c ) {
983  case 'r' :
984  return '\r';
985 
986  case 'n' :
987  return '\n';
988 
989  case 't' :
990  return '\t';
991 
992  default :
993  return c;
994  }
995 }
996 
997 // =================================================================================================
998 // Output
999 // =================================================================================================
1000 
1001 std::string repeat( std::string const& word, size_t times )
1002 {
1003  // Init and avoid repeated reallocation.
1004  std::string result;
1005  result.reserve( times * word.length() );
1006 
1007  // Concat repeats.
1008  for( size_t i = 0; i < times; ++i ) {
1009  result += word;
1010  }
1011  return result;
1012 }
1013 
1014 std::string to_string_leading_zeros( size_t value, size_t length )
1015 {
1016  std::stringstream ss;
1017  ss << std::setw( length ) << std::setfill( '0' ) << value;
1018  return ss.str();
1019 }
1020 
1021 std::string to_string_precise( double const value, int const precision )
1022 {
1023  // Simple and straight forward.
1024  std::ostringstream s;
1025  s << std::fixed << std::setprecision( precision ) << value;
1026  return s.str();
1027 }
1028 
1029 std::string to_string_rounded( double const value, int const precision )
1030 {
1031  // Get fixed precision string.
1032  std::ostringstream s;
1033  s << std::fixed << std::setprecision( precision ) << value;
1034  auto str = s.str();
1035 
1036  // Truncate trailing zeros, unless there are only zeros after the decimal point. Then, also
1037  // delete the decimal point.
1038  size_t offset = 1;
1039  size_t const last_nonzero = str.find_last_not_of('0');
1040  if( str[ last_nonzero ] == '.' ) {
1041  offset = 0;
1042  }
1043  str.erase( last_nonzero + offset, std::string::npos );
1044  return str;
1045 }
1046 
1047 std::string to_string_byte_format( size_t value )
1048 {
1049  const char* suffixes[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"};
1050  size_t magnitude = 0;
1051  double size = static_cast<double>(value);
1052 
1053  // Loop until we find the fitting order of magnitude.
1054  while( size >= 1024 && magnitude < (sizeof(suffixes) / sizeof(suffixes[0])) - 1 ) {
1055  size /= 1024;
1056  ++magnitude;
1057  }
1058 
1059  std::ostringstream oss;
1060  oss << std::fixed << std::setprecision(2) << size << suffixes[magnitude];
1061  return oss.str();
1062 }
1063 
1064 } // namespace utils
1065 } // namespace genesis
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::utils::deescape
std::string deescape(std::string const &text)
Return a string where backslash-escaped characters are transformed into their respective string form.
Definition: string.cpp:958
genesis::utils::indent
std::string indent(std::string const &text, std::string const &indentation)
Indent each line of text with indentation and return the result.
Definition: string.cpp:719
genesis::utils::trim_right
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with left trimmed white spaces (or any other delimiters).
Definition: string.cpp:803
genesis::utils::tail
std::string tail(std::string const &text, size_t lines)
Return the last lines of the text.
Definition: string.cpp:505
genesis::utils::to_upper_ascii_inplace
void to_upper_ascii_inplace(std::string &str)
Turn the given string to all-uppercase, ASCII-only, inline.
Definition: string.cpp:902
genesis::utils::equals_ci
bool equals_ci(std::string const &lhs, std::string const &rhs)
Compare two strings, case insensitive.
Definition: string.cpp:114
genesis::utils::replace_all
std::string replace_all(std::string const &text, std::string const &search, std::string const &replace)
Return a copy of a string, where all occurrences of a search string are replaced by a replace string.
Definition: string.cpp:727
genesis::utils::contains_ci
bool contains_ci(std::vector< std::string > const &haystack, std::string const &needle)
Return whether a vector of strings contains a given string, case insensitive.
Definition: string.cpp:59
common.hpp
genesis::utils::char_match_ci
constexpr bool char_match_ci(char c1, char c2) noexcept
Return whether two chars are the same, case insensitive, and ASCII-only.
Definition: char.hpp:243
genesis::utils::to_string_rounded
std::string to_string_rounded(double const value, int const precision)
Return a string representation of the input value, using the provided precision value (determining it...
Definition: string.cpp:1029
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
genesis::utils::trim
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces (or any other delimiters).
Definition: string.cpp:827
genesis::utils::to_lower_ascii_inplace
void to_lower_ascii_inplace(std::string &str)
Turn the given string to all-lowercase, ASCII-only.
Definition: string.cpp:878
genesis::utils::replace_all_chars
std::string replace_all_chars(std::string const &text, std::string const &search_chars, char replace)
Replace all occurrences of the search_chars in text by the replace char.
Definition: string.cpp:768
genesis::utils::to_string_leading_zeros
std::string to_string_leading_zeros(size_t value, size_t length)
Return a string representation of a size_t value with a fixed length, that is, by adding leading zero...
Definition: string.cpp:1014
genesis::utils::starts_with
bool starts_with(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix.
Definition: string.cpp:136
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
genesis::utils::to_string_precise
std::string to_string_precise(double const value, int const precision)
Return a precise string representation of the input value, using the provided precision value (determ...
Definition: string.cpp:1021
genesis::utils::ends_with
bool ends_with(std::string const &text, std::string const &suffix)
Return whether a string ends with another string, i.e., check for a suffix.
Definition: string.cpp:230
genesis::utils::is_alnum
constexpr bool is_alnum(char c) noexcept
Return whether a char is a letter (a-z or A-Z) or a digit (0-9), ASCII-only.
Definition: char.hpp:143
genesis::tree::equal
bool equal(Tree const &lhs, Tree const &rhs, std::function< bool(TreeNode const &, TreeNode const &) > node_comparator, std::function< bool(TreeEdge const &, TreeEdge const &) > edge_comparator)
Compare two trees for equality given binary comparator functionals for their nodes and edges.
Definition: tree/function/operators.cpp:80
genesis::utils::remove_all_chars
std::string remove_all_chars(std::string const &text, std::string const &search_chars)
Remove all occurrences of the search_chars in text.
Definition: string.cpp:782
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::utils::contains_ci_alnum
bool contains_ci_alnum(std::vector< std::string > const &haystack, std::string const &needle)
Return whether a vector of strings contains a given string, case insensitive, and ignoring all non-al...
Definition: string.cpp:71
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
string.hpp
Provides some commonly used string utility functions.
genesis::utils::head
std::string head(std::string const &text, size_t lines)
Return the first lines of the text.
Definition: string.cpp:496
genesis::utils::strncasecmp
int strncasecmp(char const *s1, char const *s2, size_t n)
Compares up to n chars of two strings, ignoring case differences.
Definition: string.cpp:90
genesis::utils::strcasecmp
int strcasecmp(char const *s1, char const *s2)
Compares two strings, ignoring case differences.
Definition: string.cpp:84
genesis::utils::equals_ci_alnum
bool equals_ci_alnum(std::string const &lhs, std::string const &rhs)
Compare two strings, case insensitive, and ignoring all non-alphanumerical characters.
Definition: string.cpp:128
genesis::utils::remove_all
std::string remove_all(std::string const &text, std::string const &search)
Return a copy of a string, where all occurrences of a search string are removed.
Definition: string.cpp:761
genesis::utils::compare_natural
int compare_natural(std::string const &lhs, std::string const &rhs)
Compare two strings with natural human sorting, that is "A1", "A2", "A100", instead of the standard s...
Definition: string.cpp:341
genesis::utils::to_upper_ascii
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
Definition: string.cpp:919
genesis::utils::remove_all_chars_pred
std::string remove_all_chars_pred(std::string const &text, UnaryPredicate predicate)
Remove all occurrences characters for which predicate is true in text.
Definition: string.hpp:430
genesis::utils::join
Interval< DataType, NumericalType, IntervalKind > join(Interval< DataType, NumericalType, IntervalKind > const &a, Interval< DataType, NumericalType, IntervalKind > const &b)
Creates a new Interval that contains both intervals and whatever is between.
Definition: utils/containers/interval_tree/functions.hpp:127
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::starts_with_ci_alnum
bool starts_with_ci_alnum(std::string const &text, std::string const &prefix)
Return whether a string starts with another string (prefix), comparing case-independent,...
Definition: string.cpp:170
genesis::utils::starts_with_ci
bool starts_with_ci(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix, case insensitive.
Definition: string.cpp:154
genesis::utils::is_digit
constexpr bool is_digit(char c) noexcept
Return whether a char is a digit (0-9), ASCII-only.
Definition: char.hpp:95
genesis::utils::ends_with_ci_alnum
bool ends_with_ci_alnum(std::string const &text, std::string const &suffix)
Return whether a string ends with another string (suffix), comparing case-independent,...
Definition: string.cpp:264
genesis::utils::to_string_byte_format
std::string to_string_byte_format(size_t value)
Produce a human readable formatting of a size in bytes, using the appropriate suffix.
Definition: string.cpp:1047
genesis::utils::to_lower_ascii
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
Definition: string.cpp:895
genesis::utils::repeat
std::string repeat(std::string const &word, size_t times)
Take a string and repeat it a given number of times.
Definition: string.cpp:1001
genesis::utils::wrap
std::string wrap(std::string const &text, size_t line_length)
Wrap a text at a given line_length.
Definition: string.cpp:680
genesis::utils::split_at
std::vector< std::string > split_at(std::string const &str, std::string const &delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter string.
Definition: string.cpp:621
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::utils::match_wildcards
bool match_wildcards(std::string const &str, std::string const &pattern)
Return whether a string is matched by a wildcard pattern containing ? and * for single and mutliple (...
Definition: string.cpp:286
genesis::utils::split_
static std::vector< std::string > split_(std::string const &string, std::function< size_t(std::string const &, size_t)> find_pos, size_t advance_by, const bool trim_empty)
Local function that does the work for the split cuntions.
Definition: string.cpp:539
genesis::utils::trim_left
std::string trim_left(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with right trimmed white spaces (or any other delimiters).
Definition: string.cpp:815
genesis::utils::count_substring_occurrences
size_t count_substring_occurrences(std::string const &str, std::string const &sub)
Return the number of (possibly overlapping) occurrences of a substring in a string.
Definition: string.cpp:518
genesis::utils::remove_all_non_alnum
std::string remove_all_non_alnum(std::string const &text)
Remove all non-alphanumerical characters from a string.
Definition: string.cpp:794
genesis::utils::split_range_list
std::vector< size_t > split_range_list(std::string const &str)
Split a string containing positive interger numbers into its parts and resolve ranges.
Definition: string.cpp:636
genesis::utils::ends_with_ci
bool ends_with_ci(std::string const &text, std::string const &suffix)
Return whether a string ends with another string, i.e., check for a suffix, case insensitive.
Definition: string.cpp:248
genesis::utils::escape
std::string escape(std::string const &text)
Return a string where special chars are replaces by their escape sequence.
Definition: string.cpp:930