A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
string.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <algorithm>
36 #include <cctype>
37 #include <cstdio>
38 #include <iomanip>
39 #include <sstream>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace utils {
44 
45 // =================================================================================================
46 // Compare
47 // =================================================================================================
48 
49 bool contains_ci( std::vector<std::string> const& haystack, std::string const& needle )
50 {
51  auto const l_needle = to_lower( needle );
52  for( auto const& val : haystack ) {
53  if( to_lower( val ) == l_needle ) {
54  return true;
55  }
56  }
57  return false;
58 }
59 
// Compare two strings for equality, ignoring case.
//
// Strings of different length are never equal. Otherwise, the strings are compared char by
// char after converting each char with std::tolower().
bool equals_ci( std::string const& lhs, std::string const& rhs)
{
    if( lhs.size() != rhs.size() ) {
        return false;
    }

    // std::tolower() requires its argument to be representable as unsigned char (or EOF).
    // Plain char may be signed, so chars outside of the ASCII range have to be converted first,
    // as passing a negative value is undefined behavior.
    auto const same_ci = []( char a, char b ){
        return std::tolower( static_cast<unsigned char>( a ))
            == std::tolower( static_cast<unsigned char>( b ));
    };
    return std::equal( lhs.begin(), lhs.end(), rhs.begin(), same_ci );
}
73 
// Return whether `text` begins with `start`. A prefix that is longer than the text never
// matches; an empty prefix always matches.
bool starts_with( std::string const & text, std::string const & start )
{
    auto const prefix_len = start.size();
    return prefix_len <= text.size() && text.compare( 0, prefix_len, start ) == 0;
}
81 
// Return whether `text` finishes with `ending`. A suffix that is longer than the text never
// matches; an empty suffix always matches.
bool ends_with( std::string const & text, std::string const & ending )
{
    auto const text_len   = text.size();
    auto const suffix_len = ending.size();
    if( suffix_len > text_len ) {
        return false;
    }

    // Compare the last suffix_len chars of the text against the suffix.
    return text.compare( text_len - suffix_len, suffix_len, ending ) == 0;
}
89 
90 // =================================================================================================
91 // Substrings
92 // =================================================================================================
93 
94 std::string head( std::string const& text, size_t lines )
95 {
96  // Not totally efficient, but works for now.
97  auto vec = split( text, "\n", false );
98  size_t remove = vec.size() > lines ? vec.size() - lines : 0;
99  vec.erase( vec.end() - remove, vec.end() );
100  return join( vec, "\n" );
101 }
102 
103 std::string tail( std::string const& text, size_t lines )
104 {
105  // Not totally efficient, but works for now.
106  auto vec = split( text, "\n", false );
107  size_t remove = vec.size() > lines ? vec.size() - lines : 0;
108  vec.erase( vec.begin(), vec.begin() + remove );
109  return join( vec, "\n" );
110 }
111 
112 // =================================================================================================
113 // Find and Count
114 // =================================================================================================
115 
// Count how often `sub` occurs in `str`. Occurrences may overlap, as the search is resumed
// one char after the start of the previous hit. An empty `sub` yields 0.
size_t count_substring_occurrences( std::string const& str, std::string const& sub )
{
    if( sub.empty() ) {
        return 0;
    }

    size_t hits = 0;
    auto pos = str.find( sub );
    while( pos != std::string::npos ) {
        ++hits;

        // Continue directly behind the start of this hit, so that overlaps are found as well.
        pos = str.find( sub, pos + 1 );
    }
    return hits;
}
133 
// Worker function for the different split functions.
//
// Parameters:
//   string      The string to split.
//   find_pos    Called as find_pos( string, start ); has to return the position of the next
//               delimiter at or after `start`, or std::string::npos if there is none.
//   advance_by  Length of the delimiter, i.e., how many chars to skip after a found position.
//   trim_empty  If true, empty parts are left out of the result.
//
// Returns the parts of the string between the delimiters, in order.
//
// Throws std::invalid_argument if a delimiter is found while advance_by is 0: the search
// position could then never move forward, which previously resulted in an endless loop.
std::vector<std::string> split (
    std::string const& string,
    std::function<size_t ( std::string const&, size_t )> find_pos,
    size_t advance_by,
    const bool trim_empty
) {
    std::vector<std::string> result;
    size_t last_pos = 0;

    while( true ) {
        // Find the next delimiter. If there is none, the rest of the string is the last part.
        size_t const pos   = find_pos( string, last_pos );
        bool   const found = ( pos != std::string::npos );
        size_t const end   = found ? pos : string.length();

        // Without progress after a match, we would find the same position over and over.
        if( found && advance_by == 0 ) {
            throw std::invalid_argument( "Cannot split string with a delimiter of length 0." );
        }

        if( end != last_pos || !trim_empty ) {
            result.push_back( string.substr( last_pos, end - last_pos ));
        }
        if( !found ) {
            break;
        }

        // Continue directly after the delimiter.
        last_pos = pos + advance_by;
    }

    return result;
}
174 
175 std::vector<std::string> split (
176  std::string const& str,
177  std::string const& delimiters,
178  const bool trim_empty
179 ) {
180  return split(
181  str,
182  [&]( std::string const& str, size_t last_pos ){
183  return str.find_first_of( delimiters, last_pos );
184  },
185  1,
186  trim_empty
187  );
188 }
189 
190 std::vector<std::string> split (
191  std::string const& str,
192  std::function<bool(char)> delimiter_predicate,
193  const bool trim_empty
194 ) {
195  return split(
196  str,
197  [&]( std::string const& str, size_t last_pos ){
198  // Find first matching char.
199  size_t pos = std::string::npos;
200  for( size_t i = last_pos; i < str.size(); ++i ) {
201  if( delimiter_predicate( str[i] ) ) {
202  pos = i;
203  break;
204  }
205  }
206  return pos;
207  },
208  1,
209  trim_empty
210  );
211 }
212 
213 std::vector<std::string> split_at (
214  std::string const& str,
215  std::string const& delimiter,
216  const bool trim_empty
217 ) {
218  return split(
219  str,
220  [&]( std::string const& str, size_t last_pos ){
221  return str.find( delimiter, last_pos );
222  },
223  delimiter.size(),
224  trim_empty
225  );
226 }
227 
228 std::vector<size_t> split_range_list( std::string const& str )
229 {
230  std::vector<size_t> result;
231 
232  auto is_digits = []( std::string const& s ){
233  return trim( s ).find_first_not_of( "0123456789" ) == std::string::npos;
234  };
235 
236  auto get_number = []( std::string const& s ){
237  size_t n;
238  sscanf( trim( s ).c_str(), "%zu", &n );
239  return n;
240  };
241 
242  if( trim( str ).empty() ) {
243  return result;
244  }
245 
246  auto const lst = split( str, "," );
247  for( auto const& le : lst ) {
248  // if just digits, done. if not, split -, repeat.
249  if( is_digits( le ) ) {
250  result.push_back( get_number( le ));
251  } else {
252  auto const rng = split( le, "-" );
253  if( rng.size() != 2 || ! is_digits( rng[0] ) || ! is_digits( rng[1] ) ) {
254  throw std::runtime_error( "Invalid range list string." );
255  }
256  auto const b = get_number( rng[0] );
257  auto const e = get_number( rng[1] );
258  for( size_t i = b; i <= e; ++i ) {
259  result.push_back( i );
260  }
261  }
262  }
263 
264  std::sort( result.begin(), result.end() );
265  return result;
266 }
267 
268 // =================================================================================================
269 // Manipulate
270 // =================================================================================================
271 
272 std::string indent(
273  std::string const& text,
274  std::string const& indentation
275 ) {
276  auto ret = indentation + replace_all( text, "\n", "\n" + indentation );
277  return trim_right( ret, indentation );
278 }
279 
// Return a copy of `text` in which every occurrence of `search` is replaced by `replace`.
//
// The text is scanned from left to right, and the search continues after each inserted
// replacement, so that chars of a replacement are never matched again.
// If `search` is empty, there is nothing to look for, and the text is returned unchanged.
std::string replace_all (
    std::string const& text, std::string const& search, std::string const& replace
) {
    // An empty search string matches at every position without consuming any chars.
    // Replacing those "matches" would never terminate, so we do not replace anything.
    if( search.empty() ) {
        return text;
    }

    std::string tmp = text;
    size_t pos = 0;
    while(( pos = tmp.find( search, pos )) != std::string::npos ) {
        tmp.replace( pos, search.length(), replace );

        // Skip the replacement, so that it is not searched again.
        pos += replace.length();
    }
    return tmp;
}
296 
297 // inline version
298 /*
299 void replace_all(
300  std::string &s, const std::string &search, const std::string &replace
301 ) {
302  for (size_t pos = 0; ; pos += replace.length() ) {
303  pos = s.find(search, pos);
304 
305  if (pos == string::npos)
306  break;
307 
308  s.erase(pos, search.length());
309  s.insert(pos, replace);
310  }
311 }
312 */
313 
// Return a copy of `text` in which every char that occurs in `search_chars` is substituted
// by the single char `replace`.
std::string replace_all_chars (
    std::string const& text,
    std::string const& search_chars,
    char replace
) {
    std::string result( text );
    auto const is_searched = [&search_chars]( char c ){
        return search_chars.find( c ) != std::string::npos;
    };
    std::replace_if( result.begin(), result.end(), is_searched, replace );
    return result;
}
327 
// Return a copy of `s` without any trailing chars that occur in `delimiters`.
// If the string consists of such chars only, the result is empty.
std::string trim_right (
    std::string const& s,
    std::string const& delimiters
) {
    // Position of the last char that has to be kept, if there is any.
    auto const last_kept = s.find_last_not_of( delimiters );
    return ( last_kept == std::string::npos )
        ? std::string()
        : s.substr( 0, last_kept + 1 )
    ;
}
339 
// Return a copy of `s` without any leading chars that occur in `delimiters`.
// If the string consists of such chars only, the result is empty.
std::string trim_left (
    std::string const& s,
    std::string const& delimiters
) {
    // Position of the first char that has to be kept, if there is any.
    auto const first_kept = s.find_first_not_of( delimiters );
    return ( first_kept == std::string::npos )
        ? std::string()
        : s.substr( first_kept )
    ;
}
351 
352 std::string trim (
353  std::string const& s,
354  std::string const& delimiters
355 ) {
356  return trim_left(trim_right(s, delimiters), delimiters);
357 }
358 
359 // =================================================================================================
360 // Normalize
361 // =================================================================================================
362 
// Return a copy of `text` in which special chars are replaced by their escape sequences:
// carriage return, newline and tab become `\r`, `\n` and `\t`, and double quotation marks and
// backslashes are prefixed with a backslash. All other chars are copied as they are.
//
// Each char of the input is escaped exactly once, in a single pass. In particular, the
// backslashes that are added for other chars are not escaped again, which would turn a newline
// into an escaped backslash followed by an 'n'.
std::string escape( std::string const& text )
{
    // The result is at least as long as the input.
    std::string result;
    result.reserve( text.size() );

    for( char const c : text ) {
        switch( c ) {
            case '\r' :
                result += "\\r";
                break;

            case '\n' :
                result += "\\n";
                break;

            case '\t' :
                result += "\\t";
                break;

            case '\"' :
                result += "\\\"";
                break;

            case '\\' :
                result += "\\\\";
                break;

            default :
                result += c;
        }
    }
    return result;
}
374 
375 std::string deescape( std::string const& text )
376 {
377  // Prepare a string that might be a bit too big, but saves reallocation.
378  std::string tmp;
379  tmp.reserve( text.size() );
380 
381  // Copy from text to tmp string, while deescaping.
382  for( size_t i = 0; i < text.size(); ++i ) {
383  if( text[ i ] == '\\' ) {
384  if( i + 1 >= text.size() ){
385  break;
386  }
387 
388  tmp += deescape( text[ i + 1 ] );
389  ++i;
390  } else {
391  tmp += text[ i ];
392  }
393  }
394  return tmp;
395 }
396 
// Return the char that is meant by an escape sequence consisting of a backslash followed by `c`:
// 'r', 'n' and 't' yield carriage return, newline and tab. Every other char is returned as is.
char deescape( char c )
{
    if( c == 'r' ) {
        return '\r';
    }
    if( c == 'n' ) {
        return '\n';
    }
    if( c == 't' ) {
        return '\t';
    }
    return c;
}
413 
414 // =================================================================================================
415 // Output
416 // =================================================================================================
417 
// Return a string that consists of `times` consecutive copies of `word`.
std::string repeat( std::string const& word, size_t times )
{
    // The final size is known beforehand, so get the memory once.
    std::string result;
    result.reserve( times * word.length() );

    for( size_t remaining = times; remaining > 0; --remaining ) {
        result.append( word );
    }
    return result;
}
430 
// Return the decimal representation of `value`, padded on the left with '0' chars up to a
// width of `length`. Values with more digits than that are not shortened.
std::string to_string_leading_zeros( size_t value, size_t length )
{
    std::ostringstream out;
    out << std::setfill( '0' ) << std::setw( length ) << value;
    return out.str();
}
437 
// Return `value` in fixed point notation, using `precision` as the stream precision,
// i.e., the number of digits after the decimal point.
std::string to_string_precise( double const value, int const precision )
{
    std::ostringstream out;
    out.setf( std::ios_base::fixed, std::ios_base::floatfield );
    out.precision( precision );
    out << value;
    return out.str();
}
445 
// Return `value` in fixed point notation with at most `precision` digits after the decimal
// point, but without trailing zeros in the fractional part. If the whole fractional part
// consists of zeros, the decimal point is removed as well.
//
// Zeros that belong to the integer part are never removed. This matters when the formatted
// number has no decimal point at all, e.g. for a precision of 0, where 10 has to stay "10".
std::string to_string_rounded( double const value, int const precision )
{
    // Get fixed precision string.
    std::ostringstream s;
    s << std::fixed << std::setprecision( precision ) << value;
    auto str = s.str();

    // Without a decimal point, there is no fractional part, and hence nothing to truncate.
    auto const point = str.find( '.' );
    if( point == std::string::npos ) {
        return str;
    }

    // Truncate trailing zeros. The decimal point itself is not a zero, so the search finds
    // either the decimal point or a char after it. If only zeros follow the decimal point,
    // delete the point as well; otherwise, keep everything up to the last non-zero digit.
    auto const last_nonzero = str.find_last_not_of( '0' );
    str.erase( last_nonzero == point ? point : last_nonzero + 1 );
    return str;
}
463 
464 } // namespace utils
465 } // namespace genesis
void offset(Histogram &h, double value)
Definition: operations.cpp:47
size_t count_substring_occurrences(std::string const &str, std::string const &sub)
Return the number of (possibly overlapping) occurrences of a substring in a string.
Definition: string.cpp:116
std::string to_string_rounded(double const value, int const precision)
Return a string representation of the input value, using the provided precision value (determining it...
Definition: string.cpp:446
bool starts_with(std::string const &text, std::string const &start)
Return whether a string starts with another string.
Definition: string.cpp:74
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:352
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with white spaces trimmed from its right end.
Definition: string.cpp:328
std::string deescape(std::string const &text)
Return a string where backslash-escaped characters are transformed into their respective string form...
Definition: string.cpp:375
std::string join(T const &v, std::string const &delimiter)
Return a string where the elements of a container v are joined using the string delimiter in between ...
Definition: string.hpp:399
std::string head(std::string const &text, size_t lines)
Return the first lines of the text.
Definition: string.cpp:94
std::string repeat(std::string const &word, size_t times)
Take a string and repeat it a given number of times.
Definition: string.cpp:418
std::string replace_all(std::string const &text, std::string const &search, std::string const &replace)
Return a copy of a string, where all occurrences of a search string are replaced by a replace string...
Definition: string.cpp:280
std::vector< std::string > split(std::string const &string, std::function< size_t(std::string const &, size_t)> find_pos, size_t advance_by, const bool trim_empty)
Local function that does the work for the split functions.
Definition: string.cpp:137
std::string to_lower(std::string const &str)
Return an all-lowercase copy of the given string, locale-aware.
Definition: string.hpp:206
std::string replace_all_chars(std::string const &text, std::string const &search_chars, char replace)
Replace all occurrences of the search_chars in text by the replace char.
Definition: string.cpp:314
std::vector< size_t > split_range_list(std::string const &str)
Split a string containing positive integer numbers into its parts and resolve ranges.
Definition: string.cpp:228
std::string escape(std::string const &text)
Return a string where special chars are replaced by their escape sequence.
Definition: string.cpp:363
std::string tail(std::string const &text, size_t lines)
Return the last lines of the text.
Definition: string.cpp:103
std::string indent(std::string const &text, std::string const &indentation)
Indent each line of text with indentation and return the result.
Definition: string.cpp:272
bool equals_ci(std::string const &lhs, std::string const &rhs)
Compare two strings case insensitive.
Definition: string.cpp:60
std::string to_string_precise(double const value, int const precision)
Return a precise string representation of the input value, using the provided precision value (determ...
Definition: string.cpp:438
std::string to_string_leading_zeros(size_t value, size_t length)
Return a string representation of a size_t value with a fixed length, that is, by adding leading zero...
Definition: string.cpp:431
std::string trim_left(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with white spaces trimmed from its left end.
Definition: string.cpp:340
Provides some commonly used string utility functions.
bool ends_with(std::string const &text, std::string const &ending)
Return whether a string ends with another string.
Definition: string.cpp:82
std::vector< std::string > split_at(std::string const &str, std::string const &delimiter, const bool trim_empty)
Split a string into parts, given a delimiter string.
Definition: string.cpp:213
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
bool equal(Tree const &lhs, Tree const &rhs, std::function< bool(TreeNode const &, TreeNode const &) > node_comparator, std::function< bool(TreeEdge const &, TreeEdge const &) > edge_comparator)
Compares two trees for equality given binary comparator functionals for their nodes and edges...
bool contains_ci(std::vector< std::string > const &haystack, std::string const &needle)
Return whether a vector of strings contains a given string, case insensitive.
Definition: string.cpp:49