A library for working with phylogenetic and population genetic data.
v0.32.0
string.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_TEXT_STRING_H_
2 #define GENESIS_UTILS_TEXT_STRING_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
35 
#include <algorithm>
#include <cctype>
#include <functional>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
44 
45 namespace genesis {
46 namespace utils {
47 
48 // =================================================================================================
49 // Compare and Find
50 // =================================================================================================
51 
/**
 * @brief Return whether a vector of strings contains a given string, case insensitive.
 */
bool contains_ci( std::vector<std::string> const& haystack, std::string const& needle );

/**
 * @brief Return whether a vector of strings contains a given string, case insensitive,
 * and ignoring all non-alphanumerical characters.
 */
bool contains_ci_alnum( std::vector<std::string> const& haystack, std::string const& needle );

/**
 * @brief Compares two strings, ignoring case differences.
 */
int strcasecmp( char const* s1, char const* s2 );

/**
 * @brief Compares up to @p n chars of two strings, ignoring case differences.
 */
int strncasecmp( char const* s1, char const* s2, size_t n );

/**
 * @brief Compare two strings, case insensitive.
 */
bool equals_ci( std::string const& lhs, std::string const& rhs );

/**
 * @brief Compare two strings, case insensitive, and ignoring all non-alphanumerical characters.
 */
bool equals_ci_alnum( std::string const& lhs, std::string const& rhs );

/**
 * @brief Return whether a string starts with another string, i.e., check for a prefix.
 */
bool starts_with( std::string const& text, std::string const& prefix );

/**
 * @brief Return whether a string starts with another string, i.e., check for a prefix.
 *
 * Overload with an additional @p suffix output parameter.
 * NOTE(review): presumably @p suffix receives the part of @p text after the prefix on a
 * match - confirm against string.cpp.
 */
bool starts_with( std::string const& text, std::string const& prefix, std::string& suffix );

/**
 * @brief Return whether a string starts with another string, i.e., check for a prefix,
 * case insensitive.
 */
bool starts_with_ci( std::string const& text, std::string const& prefix );

/**
 * @brief Return whether a string starts with another string, i.e., check for a prefix,
 * case insensitive.
 *
 * Overload with an additional @p suffix output parameter.
 * NOTE(review): presumably @p suffix receives the part of @p text after the prefix on a
 * match - confirm against string.cpp.
 */
bool starts_with_ci( std::string const& text, std::string const& prefix, std::string& suffix );

/**
 * @brief Return whether a string starts with another string (prefix), comparing
 * case-independent.
 *
 * NOTE(review): the `_alnum` name suggests that non-alphanumerical characters are ignored
 * as well, as in equals_ci_alnum() - confirm against string.cpp.
 */
bool starts_with_ci_alnum( std::string const& text, std::string const& prefix );

/**
 * @brief Return whether a string starts with another string (prefix), comparing
 * case-independent.
 *
 * Overload with an additional @p suffix output parameter and a @p trim_suffix flag
 * (default `false`).
 * NOTE(review): exact meaning of @p suffix and @p trim_suffix is not visible in this
 * header - confirm against string.cpp.
 */
bool starts_with_ci_alnum(
    std::string const& text,
    std::string const& prefix,
    std::string& suffix,
    bool trim_suffix = false
);

/**
 * @brief Return whether a string ends with another string, i.e., check for a suffix.
 */
bool ends_with( std::string const& text, std::string const& suffix );

/**
 * @brief Return whether a string ends with another string, i.e., check for a suffix.
 *
 * Overload with an additional @p prefix output parameter.
 * NOTE(review): presumably @p prefix receives the part of @p text before the suffix on a
 * match - confirm against string.cpp.
 */
bool ends_with( std::string const& text, std::string const& suffix, std::string& prefix );

/**
 * @brief Return whether a string ends with another string, i.e., check for a suffix,
 * case insensitive.
 */
bool ends_with_ci( std::string const& text, std::string const& suffix );

/**
 * @brief Return whether a string ends with another string, i.e., check for a suffix,
 * case insensitive.
 *
 * Overload with an additional @p prefix output parameter.
 * NOTE(review): presumably @p prefix receives the part of @p text before the suffix on a
 * match - confirm against string.cpp.
 */
bool ends_with_ci( std::string const& text, std::string const& suffix, std::string& prefix );

/**
 * @brief Return whether a string ends with another string (suffix), comparing
 * case-independent.
 *
 * NOTE(review): the `_alnum` name suggests that non-alphanumerical characters are ignored
 * as well, as in equals_ci_alnum() - confirm against string.cpp.
 */
bool ends_with_ci_alnum( std::string const& text, std::string const& suffix );

/**
 * @brief Return whether a string ends with another string (suffix), comparing
 * case-independent.
 *
 * Overload with an additional @p prefix output parameter and a @p trim_prefix flag
 * (default `false`).
 * NOTE(review): exact meaning of @p prefix and @p trim_prefix is not visible in this
 * header - confirm against string.cpp.
 */
bool ends_with_ci_alnum(
    std::string const& text,
    std::string const& suffix,
    std::string& prefix,
    bool trim_prefix = false
);

/**
 * @brief Return whether a string is matched by a wildcard pattern containing `?` and `*`
 * for single and multiple characters, respectively.
 */
bool match_wildcards( std::string const& str, std::string const& pattern );

/**
 * @brief Compare two strings with natural human sorting, that is "A1", "A2", "A100",
 * instead of the standard sort order.
 *
 * The functors NaturalLess and NaturalGreater treat a negative result as "lhs sorts before
 * rhs" and a positive result as "lhs sorts after rhs".
 */
int compare_natural( std::string const& lhs, std::string const& rhs );
220 
/**
 * @brief Functor class to compare two strings with natural "human" sorting,
 * see compare_natural().
 *
 * Returns `true` if @p lhs sorts strictly before @p rhs, i.e., if compare_natural()
 * yields a negative value. Usable as comparator for std::sort and ordered containers.
 */
template <class T = std::string>
struct NaturalLess {
    bool operator()( T const& lhs, T const& rhs ) const
    {
        auto const order = compare_natural( lhs, rhs );
        return order < 0;
    }
};
230 
/**
 * @brief Functor class to compare two strings with natural "human" sorting,
 * see compare_natural().
 *
 * Returns `true` if @p lhs sorts strictly after @p rhs, i.e., if compare_natural()
 * yields a positive value. This is the reverse ordering of NaturalLess.
 */
template <class T = std::string>
struct NaturalGreater {
    bool operator()( T const& lhs, T const& rhs ) const
    {
        auto const order = compare_natural( lhs, rhs );
        return order > 0;
    }
};
240 
245 template <typename RandomAccessIterator>
246 inline void sort_natural(
247  RandomAccessIterator first,
248  RandomAccessIterator last,
249  bool reverse = false
250 ) {
251  // The above implementations of NaturalLess and NaturalGreater were using std::binary_function
252  // before, which is deprecated. We hence now simply removed those, which seems to work.
253  // If that causes trouble with other compilers, see https://stackoverflow.com/a/33115341 for
254  // alternative solutions.
255 
256  using T = typename RandomAccessIterator::value_type;
257  if( reverse ) {
258  std::sort( first, last, NaturalGreater<T>() );
259  } else {
260  std::sort( first, last, NaturalLess<T>() );
261  }
262 }
263 
264 // =================================================================================================
265 // Substrings
266 // =================================================================================================
267 
/**
 * @brief Return the first lines of the text.
 *
 * @param text  Input text.
 * @param lines Number of lines to return; defaults to 10.
 */
std::string head( std::string const& text, size_t lines = 10 );

/**
 * @brief Return the last lines of the text.
 *
 * @param text  Input text.
 * @param lines Number of lines to return; defaults to 10.
 */
std::string tail( std::string const& text, size_t lines = 10 );
277 
278 // =================================================================================================
279 // Split and Count
280 // =================================================================================================
281 
/**
 * @brief Return the number of (possibly overlapping) occurrences of a substring in a string.
 */
size_t count_substring_occurrences( std::string const& str, std::string const& sub );

/**
 * @brief Split a string into parts, given a delimiter char.
 *
 * @param str        String to split.
 * @param delimiter  Delimiter char; defaults to a tab.
 * @param trim_empty Defaults to `true`.
 *                   NOTE(review): presumably drops empty parts from the result - confirm
 *                   against string.cpp.
 */
std::vector<std::string> split (
    std::string const& str,
    char delimiter = '\t',
    const bool trim_empty = true
);

/**
 * @brief Split a string into parts, given a string of @p delimiters.
 *
 * NOTE(review): presumably each char of @p delimiters acts as a delimiter on its own
 * (use split_at() to split at a whole delimiter string) - confirm against string.cpp.
 */
std::vector<std::string> split (
    std::string const& str,
    std::string const& delimiters,
    const bool trim_empty = true
);

/**
 * @brief Split a string into parts, given a predicate on chars.
 *
 * NOTE(review): presumably chars for which @p delimiter_predicate returns `true` are
 * treated as delimiters - confirm against string.cpp.
 */
std::vector<std::string> split (
    std::string const& str,
    std::function<bool (char)> delimiter_predicate,
    const bool trim_empty = true
);

/**
 * @brief Split a string into parts, given a delimiter string.
 */
std::vector<std::string> split_at (
    std::string const& str,
    std::string const& delimiter,
    const bool trim_empty = true
);

/**
 * @brief Split a string containing positive integer numbers into its parts and resolve ranges.
 */
std::vector<size_t> split_range_list( std::string const& str );
347 
348 // =================================================================================================
349 // Manipulate
350 // =================================================================================================
351 
/**
 * @brief Wrap a @p text at a given @p line_length.
 */
std::string wrap(
    std::string const& text,
    size_t line_length = 80
);

/**
 * @brief Indent each line of @p text with @p indentation and return the result.
 *
 * NOTE(review): the default indentation literal is reproduced as shown in this listing;
 * the listing may have collapsed whitespace, so verify it against the original header.
 */
std::string indent(
    std::string const& text,
    std::string const& indentation = " "
);

/**
 * @brief Return a copy of a string, where all occurrences of a search string are replaced
 * by a replace string.
 */
std::string replace_all(
    std::string const& text,
    std::string const& search,
    std::string const& replace
);

/**
 * @brief Return a copy of a string, where all occurrences of a search string are removed.
 */
std::string remove_all(
    std::string const& text,
    std::string const& search
);

/**
 * @brief Replace all occurrences of the @p search_chars in @p text by the @p replace char.
 */
std::string replace_all_chars(
    std::string const& text,
    std::string const& search_chars,
    char replace
);
398 
/**
 * @brief Replace all occurrences of characters for which @p predicate is `true` in @p text
 * by the @p replace char.
 *
 * @param text      Input string; it is not modified.
 * @param predicate Callable that is invoked with each char of a copy of @p text.
 * @param replace   Char written in place of every char for which @p predicate holds.
 * @return          Modified copy of @p text, of the same length as the input.
 */
template< class UnaryPredicate >
std::string replace_all_chars_pred(
    std::string const& text,
    UnaryPredicate predicate,
    char replace
) {
    std::string result( text );
    std::replace_if( result.begin(), result.end(), predicate, replace );
    return result;
}
417 
/**
 * @brief Remove all occurrences of the @p search_chars in @p text.
 */
std::string remove_all_chars(
    std::string const& text,
    std::string const& search_chars
);
425 
/**
 * @brief Remove all occurrences of characters for which @p predicate is `true` in @p text.
 *
 * @param text      Input string; it is not modified.
 * @param predicate Callable that is invoked with the chars of a copy of @p text.
 * @return          Copy of @p text without the chars for which @p predicate holds; the
 *                  relative order of the remaining chars is kept.
 */
template< class UnaryPredicate >
std::string remove_all_chars_pred(
    std::string const& text,
    UnaryPredicate predicate
) {
    std::string result( text );

    // Erase-remove idiom: shift the kept chars to the front, then cut off the tail.
    auto const new_end = std::remove_if( result.begin(), result.end(), predicate );
    result.erase( new_end, result.end() );
    return result;
}
438 
/**
 * @brief Remove all non-alphanumerical characters from a string.
 */
std::string remove_all_non_alnum( std::string const& text );

/**
 * @brief Return a copy of the input string, with right trimmed white spaces
 * (or any other @p delimiters).
 */
std::string trim_right (
    std::string const& s,
    std::string const& delimiters = " \f\n\r\t\v"
);

/**
 * @brief Return a copy of the input string, with left trimmed white spaces
 * (or any other @p delimiters).
 */
std::string trim_left (
    std::string const& s,
    std::string const& delimiters = " \f\n\r\t\v"
);

/**
 * @brief Return a copy of the input string, with trimmed white spaces
 * (or any other @p delimiters).
 */
std::string trim (
    std::string const& s,
    std::string const& delimiters = " \f\n\r\t\v"
);
470 
471 // =================================================================================================
472 // Case Conversion
473 // =================================================================================================
474 
/**
 * @brief Return an all-lowercase copy of the given string, locale-aware.
 *
 * Each char is converted with std::tolower, which uses the currently installed C locale.
 */
inline std::string to_lower( std::string const& str )
{
    std::string res( str );

    // std::tolower takes an int that has to be representable as unsigned char (or be EOF),
    // so each char is passed as unsigned char first.
    // See https://en.cppreference.com/w/cpp/string/byte/tolower
    std::transform( res.begin(), res.end(), res.begin(), []( unsigned char c ){
        return static_cast<char>( std::tolower( c ));
    });
    return res;
}
488 
/**
 * @brief Turn the given string to all-lowercase, locale-aware.
 *
 * The string is modified in place. Each char is converted with std::tolower, which uses
 * the currently installed C locale.
 */
inline void to_lower_inplace( std::string& str )
{
    // Pass each char as unsigned char, as required by std::tolower.
    std::transform( str.begin(), str.end(), str.begin(), []( unsigned char c ){
        return static_cast<char>( std::tolower( c ));
    });
}
498 
/**
 * @brief Return an all-uppercase copy of the given string, locale-aware.
 *
 * Each char is converted with std::toupper, which uses the currently installed C locale.
 */
inline std::string to_upper( std::string const& str )
{
    std::string res( str );

    // Pass each char as unsigned char, as required by std::toupper.
    std::transform( res.begin(), res.end(), res.begin(), []( unsigned char c ){
        return static_cast<char>( std::toupper( c ));
    });
    return res;
}
510 
/**
 * @brief Turn the given string to all-uppercase, locale-aware.
 *
 * The string is modified in place. Each char is converted with std::toupper, which uses
 * the currently installed C locale.
 */
inline void to_upper_inplace( std::string& str )
{
    // Pass each char as unsigned char, as required by std::toupper.
    std::transform( str.begin(), str.end(), str.begin(), []( unsigned char c ){
        return static_cast<char>( std::toupper( c ));
    });
}
520 
/**
 * @brief Turn the given string to all-lowercase, ASCII-only.
 */
void to_lower_ascii_inplace( std::string& str );

/**
 * @brief Return an all-lowercase copy of the given string, ASCII-only.
 */
std::string to_lower_ascii( std::string const& str );

/**
 * @brief Turn the given string to all-uppercase, ASCII-only, in place.
 */
void to_upper_ascii_inplace( std::string& str );

/**
 * @brief Return an all-uppercase copy of the given string, ASCII-only.
 */
std::string to_upper_ascii( std::string const& str );
542 
543 // =================================================================================================
544 // Normalize
545 // =================================================================================================
546 
/**
 * @brief Return a string where special chars are replaced by their escape sequence.
 */
std::string escape( std::string const& text );

/**
 * @brief Return a string where backslash-escaped characters are transformed into their
 * respective string form.
 */
std::string deescape( std::string const& text );

/**
 * @brief Single-char overload of deescape().
 *
 * NOTE(review): presumably maps the char that follows a backslash to the special char
 * that the escape sequence stands for - confirm against string.cpp.
 */
char deescape( char c );
578 
579 // =================================================================================================
580 // Output
581 // =================================================================================================
582 
/**
 * @brief Take a string and repeat it a given number of times.
 */
std::string repeat( std::string const& word, size_t times );

/**
 * @brief Return a string representation of a `size_t` @p value with a fixed @p length,
 * that is, by adding leading zeros.
 *
 * @param value  Value to print.
 * @param length Target length; defaults to 6.
 */
std::string to_string_leading_zeros( size_t value, size_t length = 6 );

/**
 * @brief Return a precise string representation of the input @p value, using the provided
 * @p precision value.
 *
 * @param value     Value to print.
 * @param precision Defaults to 6.
 *                  NOTE(review): presumably the number of digits after the decimal point -
 *                  confirm against string.cpp.
 */
std::string to_string_precise( double value, int precision = 6 );

/**
 * @brief Return a string representation of the input @p value, using the provided
 * @p precision value.
 *
 * @param value     Value to print.
 * @param precision Defaults to 6.
 *                  NOTE(review): how this differs from to_string_precise() (e.g., whether
 *                  trailing zeros are dropped) is not visible here - confirm against
 *                  string.cpp.
 */
std::string to_string_rounded( double value, int precision = 6 );
621 
/**
 * @brief Return a string representation of a given value.
 *
 * The value is written to a default-constructed std::ostringstream via `operator<<`, so
 * any type with a stream output operator can be used, and the result uses the default
 * stream formatting.
 */
template <typename T>
std::string to_string_nice( T const& v )
{
    std::ostringstream out;
    out << v;
    return out.str();
}
643 
/**
 * @brief Produce a human readable formatting of a size in bytes, using the appropriate suffix.
 */
std::string to_string_byte_format( size_t value );
650 
/**
 * @brief Return the bit representation of an unsigned int.
 *
 * The bits are printed from the most significant bit (left) to the least significant bit
 * (right), using `sizeof(T) * 8` bits in total.
 *
 * @param x          Value to print; `T` has to be an unsigned type.
 * @param zero       Char printed for an unset bit.
 * @param one        Char printed for a set bit.
 * @param byte_space If `true`, a space is printed between each group of 8 bits.
 */
template<typename T>
inline std::string to_bit_string(
    T const x, char const zero = '0', char const one = '1', bool const byte_space = true
) {
    static_assert(
        std::is_unsigned<T>::value,
        "Can only use to_bit_string() with unsigned types."
    );

    size_t const bit_count = sizeof(T) * 8;

    // Build the string front to back, starting at the most significant bit, instead of
    // repeatedly prepending to it.
    std::string binary;
    binary.reserve( bit_count + bit_count / 8 );
    for( size_t i = bit_count; i-- > 0; ) {
        binary += (( x >> i ) & 1 ) ? one : zero;

        // Separate bytes: bit i is the lowest bit of a byte that is not the last one.
        if( byte_space && i > 0 && i % 8 == 0 ) {
            binary += ' ';
        }
    }
    return binary;
}
682 
/**
 * @brief Print the elements of a @p container to a @p stream, separated by @p delimiter.
 *
 * The elements are written with their `operator<<`; the delimiter is only written between
 * elements, not after the last one. This overload is used for all containers whose
 * `value_type` is neither `unsigned char` nor `signed char`.
 *
 * @return The @p stream, to allow chaining.
 */
template <
    typename C,
    typename std::enable_if<
        ! std::is_same<typename C::value_type, unsigned char>::value &&
        ! std::is_same<typename C::value_type, signed char>::value
    >::type* = nullptr
>
std::ostream& join( std::ostream& stream, C const& container, std::string const& delimiter = ", " )
{
    // Track the first element with a flag instead of comparing element addresses, so that
    // containers yielding proxies or temporaries (e.g., std::vector<bool>) work as well.
    bool first = true;
    for( auto const& element : container ) {
        if( ! first ) {
            stream << delimiter;
        }
        first = false;
        stream << element;
    }
    return stream;
}
709 
/**
 * @brief Print the elements of a @p container to a @p stream, separated by @p delimiter.
 *
 * Overload for containers whose `value_type` is `unsigned char` or `signed char`: the
 * elements are cast to `int` before printing, so that they are written as numbers instead
 * of characters. The delimiter is only written between elements.
 *
 * @return The @p stream, to allow chaining.
 */
template <
    typename C,
    typename std::enable_if<
        std::is_same<typename C::value_type, unsigned char>::value ||
        std::is_same<typename C::value_type, signed char>::value
    >::type* = nullptr
>
std::ostream& join( std::ostream& stream, C const& container, std::string const& delimiter = ", " )
{
    // Track the first element with a flag instead of comparing element addresses, which
    // does not depend on the container handing out stable references.
    bool first = true;
    for( auto const& element : container ) {
        if( ! first ) {
            stream << delimiter;
        }
        first = false;
        stream << static_cast<int>( element );
    }
    return stream;
}
734 
/**
 * @brief Return a string with the elements of a @p container, separated by @p delimiter.
 *
 * Convenience wrapper that runs the stream-based join() on a std::ostringstream and
 * returns the resulting string.
 */
template <typename C>
std::string join( C const& container, std::string const& delimiter = ", " )
{
    std::ostringstream out;
    join( out, container, delimiter );
    return out.str();
}
746 
747 } // namespace utils
748 } // namespace genesis
749 
750 #endif // include guard
genesis::utils::deescape
std::string deescape(std::string const &text)
Return a string where backslash-escaped characters are transformed into their respective string form.
Definition: string.cpp:958
genesis::utils::indent
std::string indent(std::string const &text, std::string const &indentation)
Indent each line of text with indentation and return the result.
Definition: string.cpp:719
genesis::utils::trim_right
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with right trimmed white spaces (or any other delimiters).
Definition: string.cpp:803
genesis::utils::tail
std::string tail(std::string const &text, size_t lines)
Return the last lines of the text.
Definition: string.cpp:505
genesis::utils::to_upper_ascii_inplace
void to_upper_ascii_inplace(std::string &str)
Turn the given string to all-uppercase, ASCII-only, in place.
Definition: string.cpp:902
genesis::utils::equals_ci
bool equals_ci(std::string const &lhs, std::string const &rhs)
Compare two strings, case insensitive.
Definition: string.cpp:114
genesis::utils::replace_all_chars_pred
std::string replace_all_chars_pred(std::string const &text, UnaryPredicate predicate, char replace)
Replace all occurrences of characters for which predicate is true in text by the replace char.
Definition: string.hpp:404
genesis::utils::replace_all
std::string replace_all(std::string const &text, std::string const &search, std::string const &replace)
Return a copy of a string, where all occurrences of a search string are replaced by a replace string.
Definition: string.cpp:727
genesis::utils::contains_ci
bool contains_ci(std::vector< std::string > const &haystack, std::string const &needle)
Return whether a vector of strings contains a given string, case insensitive.
Definition: string.cpp:59
genesis::utils::to_string_rounded
std::string to_string_rounded(double const value, int const precision)
Return a string representation of the input value, using the provided precision value (determining it...
Definition: string.cpp:1029
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
genesis::utils::to_upper_inplace
void to_upper_inplace(std::string &str)
Turn the given string to all-uppercase, locale-aware.
Definition: string.hpp:514
genesis::utils::trim
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces (or any other delimiters).
Definition: string.cpp:827
genesis::utils::sort_natural
void sort_natural(RandomAccessIterator first, RandomAccessIterator last, bool reverse=false)
Sort a range of std::string (or convertible to std::string) elements, using natural sorting; see comp...
Definition: string.hpp:246
genesis::utils::to_lower_ascii_inplace
void to_lower_ascii_inplace(std::string &str)
Turn the given string to all-lowercase, ASCII-only.
Definition: string.cpp:878
genesis::utils::replace_all_chars
std::string replace_all_chars(std::string const &text, std::string const &search_chars, char replace)
Replace all occurrences of the search_chars in text by the replace char.
Definition: string.cpp:768
genesis::utils::to_string_leading_zeros
std::string to_string_leading_zeros(size_t value, size_t length)
Return a string representation of a size_t value with a fixed length, that is, by adding leading zero...
Definition: string.cpp:1014
genesis::utils::starts_with
bool starts_with(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix.
Definition: string.cpp:136
genesis::utils::to_string_precise
std::string to_string_precise(double const value, int const precision)
Return a precise string representation of the input value, using the provided precision value (determ...
Definition: string.cpp:1021
genesis::utils::ends_with
bool ends_with(std::string const &text, std::string const &suffix)
Return whether a string ends with another string, i.e., check for a suffix.
Definition: string.cpp:230
genesis::utils::NaturalGreater::operator()
bool operator()(T const &lhs, T const &rhs) const
Definition: string.hpp:236
genesis::utils::remove_all_chars
std::string remove_all_chars(std::string const &text, std::string const &search_chars)
Remove all occurrences of the search_chars in text.
Definition: string.cpp:782
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Split a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::utils::contains_ci_alnum
bool contains_ci_alnum(std::vector< std::string > const &haystack, std::string const &needle)
Return whether a vector of strings contains a given string, case insensitive, and ignoring all non-al...
Definition: string.cpp:71
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::utils::head
std::string head(std::string const &text, size_t lines)
Return the first lines of the text.
Definition: string.cpp:496
genesis::utils::strncasecmp
int strncasecmp(char const *s1, char const *s2, size_t n)
Compares up to n chars of two strings, ignoring case differences.
Definition: string.cpp:90
genesis::utils::strcasecmp
int strcasecmp(char const *s1, char const *s2)
Compares two strings, ignoring case differences.
Definition: string.cpp:84
genesis::utils::equals_ci_alnum
bool equals_ci_alnum(std::string const &lhs, std::string const &rhs)
Compare two strings, case insensitive, and ignoring all non-alphanumerical characters.
Definition: string.cpp:128
genesis::utils::remove_all
std::string remove_all(std::string const &text, std::string const &search)
Return a copy of a string, where all occurrences of a search string are removed.
Definition: string.cpp:761
genesis::utils::compare_natural
int compare_natural(std::string const &lhs, std::string const &rhs)
Compare two strings with natural human sorting, that is "A1", "A2", "A100", instead of the standard s...
Definition: string.cpp:341
genesis::utils::to_upper_ascii
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
Definition: string.cpp:919
genesis::utils::remove_all_chars_pred
std::string remove_all_chars_pred(std::string const &text, UnaryPredicate predicate)
Remove all occurrences of characters for which predicate is true in text.
Definition: string.hpp:430
genesis::utils::NaturalGreater
Functor class to compare two strings with natural "human" sorting, see compare_natural().
Definition: string.hpp:235
genesis::utils::join
Interval< DataType, NumericalType, IntervalKind > join(Interval< DataType, NumericalType, IntervalKind > const &a, Interval< DataType, NumericalType, IntervalKind > const &b)
Creates a new Interval that contains both intervals and whatever is between.
Definition: utils/containers/interval_tree/functions.hpp:127
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::starts_with_ci_alnum
bool starts_with_ci_alnum(std::string const &text, std::string const &prefix)
Return whether a string starts with another string (prefix), comparing case-independent,...
Definition: string.cpp:170
genesis::utils::starts_with_ci
bool starts_with_ci(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix, case insensitive.
Definition: string.cpp:154
genesis::utils::ends_with_ci_alnum
bool ends_with_ci_alnum(std::string const &text, std::string const &suffix)
Return whether a string ends with another string (suffix), comparing case-independent,...
Definition: string.cpp:264
genesis::utils::to_string_byte_format
std::string to_string_byte_format(size_t value)
Produce a human readable formatting of a size in bytes, using the appropriate suffix.
Definition: string.cpp:1047
char.hpp
genesis::utils::to_string_nice
std::string to_string_nice(T const &v)
Return a string representation of a given value.
Definition: string.hpp:637
genesis::utils::to_lower_ascii
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
Definition: string.cpp:895
genesis::utils::repeat
std::string repeat(std::string const &word, size_t times)
Take a string and repeat it a given number of times.
Definition: string.cpp:1001
genesis::utils::wrap
std::string wrap(std::string const &text, size_t line_length)
Wrap a text at a given line_length.
Definition: string.cpp:680
genesis::utils::NaturalLess::operator()
bool operator()(T const &lhs, T const &rhs) const
Definition: string.hpp:226
genesis::utils::split_at
std::vector< std::string > split_at(std::string const &str, std::string const &delimiter, const bool trim_empty)
Split a string into parts, given a delimiter string.
Definition: string.cpp:621
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::utils::match_wildcards
bool match_wildcards(std::string const &str, std::string const &pattern)
Return whether a string is matched by a wildcard pattern containing ? and * for single and multiple (...
Definition: string.cpp:286
genesis::utils::to_bit_string
std::string to_bit_string(T const x, char const zero='0', char const one='1', bool const byte_space=true)
Return the bit representation of an unsigned int.
Definition: string.hpp:659
genesis::utils::trim_left
std::string trim_left(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with left trimmed white spaces (or any other delimiters).
Definition: string.cpp:815
genesis::utils::to_lower_inplace
void to_lower_inplace(std::string &str)
Turn the given string to all-lowercase, locale-aware.
Definition: string.hpp:492
genesis::utils::NaturalLess
Functor class to compare two strings with natural "human" sorting, see compare_natural().
Definition: string.hpp:225
genesis::utils::count_substring_occurrences
size_t count_substring_occurrences(std::string const &str, std::string const &sub)
Return the number of (possibly overlapping) occurrences of a substring in a string.
Definition: string.cpp:518
genesis::utils::remove_all_non_alnum
std::string remove_all_non_alnum(std::string const &text)
Remove all non-alphanumerical characters from a string.
Definition: string.cpp:794
genesis::utils::split_range_list
std::vector< size_t > split_range_list(std::string const &str)
Split a string containing positive integer numbers into its parts and resolve ranges.
Definition: string.cpp:636
genesis::utils::ends_with_ci
bool ends_with_ci(std::string const &text, std::string const &suffix)
Return whether a string ends with another string, i.e., check for a suffix, case insensitive.
Definition: string.cpp:248
genesis::utils::escape
std::string escape(std::string const &text)
Return a string where special chars are replaced by their escape sequence.
Definition: string.cpp:930