A library for working with phylogenetic and population genetic data.
v0.32.0
parser.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_IO_PARSER_H_
2 #define GENESIS_UTILS_IO_PARSER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
36 
37 #include <cassert>
38 #include <cmath>
39 #include <cctype>
40 #include <cstring>
41 #include <limits>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace utils {
46 
47 // =================================================================================================
48 // Integer
49 // =================================================================================================
50 
58 size_t parse_unsigned_integer_size_t( utils::InputStream& source );
59 
67 template<class T>
69 {
70  // No need to assert unsignedness here. We will later check that casting to the desired
71  // type worked, and we test for the correct sign there as well, so that it workes for
72  // signed types.
73  // static_assert(
74  // std::is_unsigned<T>::value,
75  // "Need unsigned type for parse_unsigned_integer()"
76  // );
77 
78  auto const x = parse_unsigned_integer_size_t( source );
79 
80  // We parsed as largest, and now try to cast to desired type,
81  // testing that back-conversion gives the same value and correct sign.
82  auto const r = static_cast<T>(x);
83  if( static_cast<size_t>(r) != x || r < 0 ) {
84  throw std::overflow_error(
85  "Numerical overflow in " + source.source_name() + " at " + source.at() + "."
86  );
87  }
88  return r;
89 }
90 
100 template<class T>
102 {
103  static_assert(
104  std::is_signed<T>::value,
105  "Need signed type for parse_signed_integer()"
106  );
107 
108  if( !source ) {
109  throw std::runtime_error(
110  "Expecting number in " + source.source_name() + " at " + source.at() + "."
111  );
112  }
113 
114  int sign = 1;
115  if( *source == '-' || *source == '+' ) {
116  if( *source == '-' ) {
117  sign = -1;
118  }
119  ++source;
120  }
121 
122  // Parse as largest, but report overflow as underflow if negative.
123  size_t x;
124  try {
125  x = parse_unsigned_integer_size_t( source );
126  } catch( std::overflow_error const& ) {
127  if( sign == -1 ) {
128  throw std::underflow_error(
129  "Numerical overflow in " + source.source_name() + " at " + source.at() + "."
130  );
131  } else {
132  throw;
133  }
134  }
135 
136  // Lastly, try to cast to desired type, testing that back-conversion
137  // gives the same value and sign.
138  // The back-cast of `sign * r` is always valid, as the negative range of signed ints
139  // is smaller than the positive, so if it's negative, multiplying by -1 will always result
140  // in a valid value.
141  auto const r = static_cast<T>( sign * static_cast<T>(x) );
142  if( static_cast<size_t>( sign * r ) != x || !( r == 0 || (sign < 0) == (r < 0) )) {
143  throw std::underflow_error(
144  "Numerical overflow in " + source.source_name() + " at " + source.at() + "."
145  );
146  }
147  return r;
148 }
149 
153 template<class T>
155 {
156  return parse_signed_integer<T>(source);
157 }
158 
159 // =================================================================================================
160 // Float
161 // =================================================================================================
162 
175 template<class T>
177 {
178  T x = 0.0;
179 
180  if( !source ) {
181  throw std::runtime_error(
182  "Expecting float number in " + source.source_name() + " at " + source.at() + "."
183  );
184  }
185 
186  // Sign
187  bool is_neg = false;
188  if( *source == '-' ){
189  is_neg = true;
190  ++source;
191  } else if( *source == '+' ) {
192  ++source;
193  }
194 
195  // Integer Part
196  bool found_mantisse = false;
197  while( source && utils::is_digit( *source )) {
198  int y = *source - '0';
199  x *= 10;
200  x += y;
201  ++source;
202  found_mantisse = true;
203  }
204 
205  // Decimal part
206  if( source && *source == '.' ) {
207  ++source;
208 
209  if( ! source || ! utils::is_digit( *source ) ) {
210  throw std::runtime_error(
211  "Invalid number in " + source.source_name() + " at " + source.at() + "."
212  );
213  }
214 
215  T pos = 1.0;
216  while( source && utils::is_digit( *source )) {
217  pos /= 10.0;
218  int y = *source - '0';
219  x += y * pos;
220  ++source;
221  found_mantisse = true;
222  }
223  }
224 
225  // Special cases.
226  if( ! found_mantisse && source ) {
227  static_assert(
228  std::numeric_limits<double>::is_iec559,
229  "Compiler does not use ISO IEC559 / IEEE754 standard."
230  );
231 
232  // Check against usual special cases. We use value == 0.0 to indicate the default case
233  // (no match), and any other value will be what we found then.
234  double const sign = is_neg ? -1.0 : 1.0;
235  double value = 0.0;
236  auto const buffer = source.buffer();
237  if( buffer.second >= 8 && strncasecmp( buffer.first, "infinity", 8 ) == 0 ) {
238  source.jump_unchecked( 8 );
239  value = sign * std::numeric_limits<double>::infinity();
240  } else if( buffer.second >= 3 && strncasecmp( buffer.first, "inf", 3 ) == 0 ) {
241  source.jump_unchecked( 3 );
242  value = sign * std::numeric_limits<double>::infinity();
243  }
244  if( buffer.second >= 3 && strncasecmp( buffer.first, "nan", 3 ) == 0 ) {
245  source.jump_unchecked( 3 );
246  value = sign * std::numeric_limits<double>::quiet_NaN();
247  }
248 
249  // If we found something, we need to check that this is not part of some other
250  // random longer string. The next char needs to be something non-alnum.
251  // Bit stricter than usual double parsing, but should cover all cases except for
252  // the `nanCHAR` notation. If this check fails, we instead continue,
253  // which will throw in the next check below.
254  if( value != 0.0 && ( !source || ! is_alnum( *source ))) {
255  return value;
256  }
257  }
258 
259  // We need to have some digits before the exponential part.
260  if( ! found_mantisse ) {
261  throw std::runtime_error(
262  "Invalid number in " + source.source_name() + " at " + source.at() + "."
263  );
264  }
265 
266  // Exponential part
267  if( source && utils::to_lower(*source) == 'e' ) {
268  ++source;
269 
270  // Read the exp. If there are no digits, this throws.
271  int e = parse_signed_integer<int>( source );
272 
273  if( e != 0 ) {
274  T base;
275  if( e < 0 ) {
276  base = 0.1;
277  e = -e;
278  } else {
279  base = 10;
280  }
281 
282  while( e != 1 ) {
283  if( ( e & 1 ) == 0 ) {
284  base = base * base;
285  e >>= 1;
286  } else {
287  x *= base;
288  --e;
289  }
290  }
291  x *= base;
292  }
293  }
294 
295  // Sign
296  if (is_neg) {
297  x = -x;
298  }
299 
300  return x;
301 }
302 
303 // =================================================================================================
304 // General Number String
305 // =================================================================================================
306 
317 std::string parse_number_string(
318  utils::InputStream& source
319 );
320 
321 // =================================================================================================
322 // String
323 // =================================================================================================
324 
350 std::string parse_quoted_string(
351  utils::InputStream& source,
352  bool use_escapes = true,
353  bool use_twin_quotes = false,
354  bool include_qmarks = false
355 );
356 
357 } // namespace utils
358 } // namespace genesis
359 
360 #endif // include guard
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:437
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:478
genesis::utils::InputStream::buffer
std::pair< char const *, size_t > buffer()
Direct access to the internal buffer.
Definition: input_stream.hpp:390
genesis::utils::parse_integer
T parse_integer(utils::InputStream &source)
Alias for parse_signed_integer().
Definition: parser.hpp:154
genesis::utils::parse_float
T parse_float(utils::InputStream &source)
Read a floating point number from a stream and return it.
Definition: parser.hpp:176
genesis::utils::is_alnum
constexpr bool is_alnum(char c) noexcept
Return whether a char is a letter (a-z or A-Z) or a digit (0-9), ASCII-only.
Definition: char.hpp:143
input_stream.hpp
genesis::utils::strncasecmp
int strncasecmp(char const *s1, char const *s2, size_t n)
Compares up to n chars of two strings, ignoring case differences.
Definition: string.cpp:90
genesis::utils::parse_unsigned_integer_size_t
size_t parse_unsigned_integer_size_t(utils::InputStream &source)
Parse the input source as an unsigned int into a size_t.
Definition: parser.cpp:453
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::parse_unsigned_integer
T parse_unsigned_integer(utils::InputStream &source)
Read an unsigned integer from a stream and return it.
Definition: parser.hpp:68
genesis::utils::is_digit
constexpr bool is_digit(char c) noexcept
Return whether a char is a digit (0-9), ASCII-only.
Definition: char.hpp:95
char.hpp
genesis::utils::parse_signed_integer
T parse_signed_integer(utils::InputStream &source)
Read a signed integer from a stream and return it.
Definition: parser.hpp:101
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::utils::parse_quoted_string
std::string parse_quoted_string(utils::InputStream &source, bool use_escapes, bool use_twin_quotes, bool include_qmarks)
Read a string in quotation marks from a stream and return it.
Definition: parser.cpp:547
genesis::utils::InputStream::jump_unchecked
void jump_unchecked(size_t n)
Jump forward in the stream by a certain amount of chars.
Definition: input_stream.cpp:609
genesis::utils::parse_number_string
std::string parse_number_string(utils::InputStream &source)
Read a general number string from an input stream.
Definition: parser.cpp:479