A library for working with phylogenetic and population genetic data.
v0.32.0
frequency_table_input_stream.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
44 
45 #include <algorithm>
46 #include <cassert>
47 #include <cstring>
48 #include <limits>
49 #include <stdexcept>
50 
51 namespace genesis {
52 namespace population {
53 
54 // =================================================================================================
55 // Init and Header Parsing
56 // =================================================================================================
57 
58 // -------------------------------------------------------------------------
59 // sample_names
60 // -------------------------------------------------------------------------
61 
62 std::vector<std::string> FrequencyTableInputStream::Iterator::sample_names() const
63 {
64  // We only need this rarely, so we don't need an efficient algorithm for this.
65  // Could be done by storing a vector in the first place, but that would mean that we need to
66  // keep that in sync with the other data objects... So instead we just create the vector here.
67  std::vector<std::string> result;
68  result.resize( header_info_.sample_infos.size() );
69  for( auto const& sample_info : header_info_.sample_infos ) {
70  assert( sample_info.second.index < result.size() );
71  result[ sample_info.second.index ] = sample_info.first;
72  }
73  return result;
74 }
75 
76 // -------------------------------------------------------------------------
77 // parse_header_
78 // -------------------------------------------------------------------------
79 
80 void FrequencyTableInputStream::Iterator::parse_header_()
81 {
82  // Only called when we have a parent.
83  assert( parent_ );
84 
85  // Get a vector of each field in the header line.
86  auto const header_fields = utils::split(
87  input_stream_->get_line(), parent_->separator_char_, false
88  );
89 
90  // We keep a list of all sample names that we found, including the ignored ones,
91  // in order to check that this fits with the given fitler sample name list, just in case.
92  std::unordered_set<std::string> all_samplenames;
93 
94  // Parse all the headers once, in a dry run, to collect information on the samples.
95  // We do this two-pass process in order to be able to correctly size the resulting vectors,
96  // and use pointers to the actual data elements directly, instead of indices, which is likely
97  // to be going to be slower.
98  for( auto const& field : header_fields ) {
99  // Safety.
100  if( field.empty() || ! std::all_of( field.begin(), field.end(), utils::is_graph )) {
101  throw std::runtime_error(
102  "Invalid frequency table with non-graph characters or empty field in header."
103  );
104  }
105 
106  // Do something with the field, depending on what text it contains.
107  parse_header_field_( field, all_samplenames );
108  }
109 
110  // Important checks that we could do as an assertion, but it's kind of what this whole class
111  // relies on, so let's check it all the time.
112  if( column_processors_.size() != header_fields.size() ) {
113  throw std::domain_error(
114  "Internal error: Number of column processors does not match number of columns"
115  );
116  }
117 
118  // Check that all samples have at least two of the fields,
119  // so that we can compute actual counts from them.
120  check_header_fields_( all_samplenames );
121 
122  // If ref and/or alt base columns are not present, we want to use 'N' instead.
123  // As there are no column processors for those in this case, we can just leave the Variant
124  // at it's default, which is 'N' for both, and it will never change. Assert here that we have 'N'
125  assert( current_variant_->reference_base == 'N' );
126  assert( current_variant_->alternative_base == 'N' );
127 
128  // Now resize the intermediate and result data to the number of samples that we are expecting.
129  // The processor lambdas will directly write to these, using indices into them.
130  sample_data_->resize( header_info_.sample_infos.size() );
131  current_variant_->samples.resize( header_info_.sample_infos.size() );
132 }
133 
134 // -------------------------------------------------------------------------
135 // check_header_fields_
136 // -------------------------------------------------------------------------
137 
138 void FrequencyTableInputStream::Iterator::check_header_fields_(
139  std::unordered_set<std::string> const& all_samplenames
140 ) const {
141  // Check that chr and pos are there.
142  // The ref and alt base are not needed, we will just use 'N' instead.
143  if( ! header_info_.has_chr ) {
144  throw std::runtime_error(
145  "Invalid frequency table that does not contain a chromosome column"
146  );
147  }
148  if( ! header_info_.has_pos ) {
149  throw std::runtime_error(
150  "Invalid frequency table that does not contain a position column"
151  );
152  }
153 
154  // Check that we have enough information for each sample to be processable.
155  // We can work with just ref and alt counts, or just the frequency. If either of those is
156  // present, we are good.
157  for( auto const& sample_info_entry : header_info_.sample_infos ) {
158  auto const& sample_info = sample_info_entry.second;
159 
160  // List all valid combinations of data fields that we can handle.
161  // If either of them occurs, we can parse that sample and extract its counts.
162  bool good = sample_info.has_frq;
163  good |= ( sample_info.has_ref && sample_info.has_alt );
164  good |= ( sample_info.has_cov && ( sample_info.has_ref || sample_info.has_alt ));
165  if( ! good ) {
166  throw std::runtime_error(
167  "Frequency table sample \"" + sample_info_entry.first + "\" does not contain enough "
168  "information to compute allele counts."
169  );
170  }
171  }
172 
173  // We also want to warn if not all sample types are the same. That might indicate that
174  // something is not quite right. That is, if one sample has frequencies, but another one
175  // has counts, that seems rather fishy. We can work with this, but might want to tell the user.
176  // To this end, we turn the bools into binary indicator flags, and check for equality.
177  int sample_flags = 0;
178  for( auto const& sample_info_entry : header_info_.sample_infos ) {
179  auto const& sample_info = sample_info_entry.second;
180 
181  // Create the flag. We already checked before that it never is 0, as otherwise, the
182  // previous check in the loop above would not have succeeded.
183  int flag = 0;
184  flag += sample_info.has_ref ? 1 : 0;
185  flag += sample_info.has_alt ? 2 : 0;
186  flag += sample_info.has_frq ? 4 : 0;
187  flag += sample_info.has_cov ? 8 : 0;
188  assert( flag > 0 );
189  if( sample_flags == 0 ) {
190  sample_flags = flag;
191  }
192 
193  // If the flags differ, issue a warning once. We then don't need to further check.
194  if( sample_flags != flag ) {
195  LOG_WARN << "Frequency table samples contain different types of data "
196  << "(reference or alternative counts, frequencies, or read depth). "
197  << "We can handle this, but it might indicate that something went wrong "
198  << "when parsing and interpreting the header fields to obtain sample names.";
199  break;
200  }
201  }
202 
203  // Check that all given sample names for filtering are actually valid names.
204  assert( parent_ );
205  for( auto const& sn : parent_->sample_names_filter_ ) {
206  if( all_samplenames.count( sn ) == 0 ) {
207  throw std::invalid_argument(
208  "Frequency table header does not contain given sample name filter \"" + sn + "\"."
209  );
210  }
211  }
212 }
213 
214 // =================================================================================================
215 // Parse Header Fields
216 // =================================================================================================
217 
218 // -------------------------------------------------------------------------
219 // parse_header_field_
220 // -------------------------------------------------------------------------
221 
222 void FrequencyTableInputStream::Iterator::parse_header_field_(
223  std::string const& field,
224  std::unordered_set<std::string>& all_samplenames
225 ) {
226  // We already checked before, so just assert here.
227  assert( ! field.empty() );
228 
229  // We here have some field given from the table header, and want to figure out what it is.
230  // We try all our types of fields that are supported by this reader: is it
231  // - the chromosome name or the position in the chromosome
232  // - the ref or alt base
233  // - one of numbers of ref/alt counts, read depth, or frequency for a sample
234  // We try to evaluate the field as all types, and see if any of them matches the patterns we
235  // are looking for, and keep track of how many matched (the functions return 1 on success of
236  // matching, and 0 otherwise), to make sure that we only have at most one valid match.
237  // If more than that, the field is ambiguous as far as our pattern matching is concerned,
238  // and in that case, we cannot continue safely.
239  // While matching, the functions also set up the respective parsers for the columns,
240  // so that after this, we have a parser for all columns set up.
241  int matches = 0;
242  matches += evaluate_if_field_is_chr_( field );
243  matches += evaluate_if_field_is_pos_( field );
244  matches += evaluate_if_field_is_ref_( field );
245  matches += evaluate_if_field_is_alt_( field );
246  matches += evaluate_if_field_is_sample_ref_( field, all_samplenames );
247  matches += evaluate_if_field_is_sample_alt_( field, all_samplenames );
248  matches += evaluate_if_field_is_sample_frq_( field, all_samplenames );
249  matches += evaluate_if_field_is_sample_cov_( field, all_samplenames );
250 
251  if( matches == 0 ) {
252  // Field that we could not make sense of. Ignored for now; might add warning later.
253  // In order to ignore it, we need to use an empty processor function that does nothing.
254  auto cur_var = current_variant_;
255  auto const sep_char = parent_->separator_char_;
256  column_processors_.push_back( [cur_var, sep_char]( genesis::utils::InputStream& it ){
258  it, [sep_char]( char c ){
259  return c == '\n' || c == sep_char;
260  }
261  );
262  });
263  }
264  if( matches > 1 ) {
265  // If multiple match, we cannot distinguish between them, and rather throw than cause
266  // unexpected behaviour for the user.
267  throw std::runtime_error(
268  "Cannot read frequency table header, as it contains ambiguous headers. "
269  "Header field name \"" + field + "\" matches multiple types of data columns."
270  );
271  }
272 }
273 
274 // -------------------------------------------------------------------------
275 // evaluate_if_field_is_chr_
276 // -------------------------------------------------------------------------
277 
278 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_chr_(
279  std::string const& field
280 ) {
281  assert( parent_ );
282  assert( ! field.empty() );
283 
284  // Check if we find a matching header name. Either it matches the user provided header column
285  // name, if that is provided, or it matches any of our predefined names for that column.
286  // We don't actually need to check if the usr_chr_name_ is non-empty, as it only matches the
287  // non-empty field if it also is non-empty. If it doesn't match either, we are done here.
288  if( ! match_header_field_( field, parent_->usr_chr_name_, parent_->chr_names_ )) {
289  return 0;
290  }
291 
292  // Checks that the type of column was not already given via some other column header.
293  if( header_info_.has_chr ) {
294  throw std::runtime_error(
295  "Cannot unambiguously parse frequency table header, "
296  "as it contains multiple columns for the chromosome."
297  );
298  }
299  header_info_.has_chr = true;
300 
301  // We need a copy of the ptr, so that we can capture this with a lambda.
302  // In C++11, lambda cannot yet directly capture class members...
303  auto cur_var = current_variant_;
304  auto const sep_char = parent_->separator_char_;
305 
306  // Add processing function that will be used for parsing values of this column.
307  column_processors_.push_back( [cur_var, sep_char]( genesis::utils::InputStream& it ){
308  // Read the chromosome name, and check its validity.
309  cur_var->chromosome = utils::read_until(
310  it, [sep_char]( char c ){
311  return ! utils::is_graph( c ) || c == '\n' || c == sep_char;
312  }
313  );
314  if( cur_var->chromosome.empty() ) {
315  throw std::runtime_error(
316  "Malformed frequency table with empty chromosome name in line " + it.at()
317  );
318  }
319  });
320 
321  // Indicate that we found a matching header name.
322  return 1;
323 }
324 
325 // -------------------------------------------------------------------------
326 // evaluate_if_field_is_pos_
327 // -------------------------------------------------------------------------
328 
329 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_pos_(
330  std::string const& field
331 ) {
332  // Same setup as in evaluate_if_field_is_chr_(). See there for comments.
333  // Only difference is the parsing function in the processor lambda.
334 
335  assert( parent_ );
336  assert( ! field.empty() );
337  if( ! match_header_field_( field, parent_->usr_pos_name_, parent_->pos_names_ )) {
338  return 0;
339  }
340  if( header_info_.has_pos ) {
341  throw std::runtime_error(
342  "Cannot unambiguously parse frequency table header, "
343  "as it contains multiple columns for the position."
344  );
345  }
346  header_info_.has_pos = true;
347  auto cur_var = current_variant_;
348  column_processors_.push_back( [cur_var]( genesis::utils::InputStream& it ){
349  cur_var->position = utils::parse_unsigned_integer<size_t>( it );
350  if( cur_var->position == 0 ) {
351  throw std::runtime_error(
352  "Malformed frequency table with position == 0 in line " + it.at()
353  );
354  }
355  });
356  return 1;
357 }
358 
359 // -------------------------------------------------------------------------
360 // evaluate_if_field_is_ref_
361 // -------------------------------------------------------------------------
362 
363 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_ref_(
364  std::string const& field
365 ) {
366  // Same setup as in evaluate_if_field_is_chr_(). See there for comments.
367  // Only difference is the parsing function in the processor lambda.
368 
369  assert( parent_ );
370  assert( ! field.empty() );
371  if( ! match_header_field_( field, parent_->usr_ref_name_, parent_->ref_names_ )) {
372  return 0;
373  }
374  if( header_info_.has_ref ) {
375  throw std::runtime_error(
376  "Cannot unambiguously parse frequency table header, "
377  "as it contains multiple columns for the reference base."
378  );
379  }
380  header_info_.has_ref = true;
381  auto cur_var = current_variant_;
382  column_processors_.push_back( [cur_var]( genesis::utils::InputStream& it ){
383  // Read the single char base, and check it.
384  // This even works when we are at the end of the data already.
385  auto const b = utils::to_upper( *it );
386  if( ! is_valid_base_or_n( b )) {
387  throw std::runtime_error(
388  "Malformed frequency table with reference base not in [ACGTN] in line " + it.at()
389  );
390  }
391  cur_var->reference_base = b;
392  ++it;
393  });
394  return 1;
395 }
396 
397 // -------------------------------------------------------------------------
398 // evaluate_if_field_is_alt_
399 // -------------------------------------------------------------------------
400 
401 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_alt_(
402  std::string const& field
403 ) {
404  // Same setup as in evaluate_if_field_is_chr_(). See there for comments.
405  // Only difference is the parsing function in the processor lambda.
406 
407  assert( parent_ );
408  assert( ! field.empty() );
409  if( ! match_header_field_( field, parent_->usr_alt_name_, parent_->alt_names_ )) {
410  return 0;
411  }
412  if( header_info_.has_alt ) {
413  throw std::runtime_error(
414  "Cannot unambiguously parse frequency table header, "
415  "as it contains multiple columns for the alternative base."
416  );
417  }
418  header_info_.has_alt = true;
419  auto cur_var = current_variant_;
420  column_processors_.push_back( [cur_var]( genesis::utils::InputStream& it ){
421  // Same as above for the ref base.
422  auto const b = utils::to_upper( *it );
423  if( ! is_valid_base_or_n( b )) {
424  throw std::runtime_error(
425  "Malformed frequency table with alternative base not in [ACGTN] in line " + it.at()
426  );
427  }
428  cur_var->alternative_base = b;
429  ++it;
430  });
431  return 1;
432 }
433 
434 // -------------------------------------------------------------------------
435 // evaluate_if_field_is_sample_ref_
436 // -------------------------------------------------------------------------
437 
438 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_ref_(
439  std::string const& field,
440  std::unordered_set<std::string>& all_samplenames
441 ) {
442  // Here, we use a different matching algorithm than for the fixed fields:
443  // We try to find a match that also contains a prefix or suffix that is the sample name.
444  // Furthermore, we then need to take care of setting up the sample information
445  // and the data accordingly.
446 
447  // Check if any combination of names fits the fields, and store the remaining sample name.
448  assert( parent_ );
449  assert( ! field.empty() );
450  std::string samplename;
451  if( ! match_header_sample_(
452  field, parent_->usr_smp_ref_name_, parent_->ref_names_, parent_->cnt_names_, samplename
453  )) {
454  return 0;
455  }
456 
457  // Insert the sample name into the list, for later checking.
458  all_samplenames.insert( samplename );
459 
460  // Check if we ignore the sample. We do this here before it has been added to the sample info.
461  // We still read the value, to make sure that it's good data, but don't do anything with it.
462  // We return 1 here to indicate that we matched with a pattern successfully.
463  if( is_ignored_sample_( samplename )) {
464  // Local copy of the parent pointer, so that we can capture it in the lambda in C++11...
465  auto parent = parent_;
466  column_processors_.push_back( [parent]( genesis::utils::InputStream& it ){
467  // If the filed is missing data, we parse it in the function to check that.
468  // If it isn't, we parese it as an int, but discard the value.
469  if( ! parse_if_missing_( parent, it ) ) {
470  utils::parse_unsigned_integer<size_t>( it );
471  }
472  });
473  return 1;
474  }
475 
476  // First check if we have the sample already, or need to create a new one in the info.
477  // Then, check that a ref base count was not already found for this sample name.
478  auto& sample_info = get_sample_info_( samplename );
479  if( sample_info.has_ref ) {
480  throw std::runtime_error(
481  "Cannot unambiguously parse frequency table header, as it contains multiple columns "
482  "for the reference count of sample \"" + samplename + "\"."
483  );
484  }
485  sample_info.has_ref = true;
486 
487  // Now we have a match, and need to set up the column for the sample.
488  // We again use a copy of the shared ptr here, so that the lambda can capture it,
489  // and we capture the index by value. We cannot capture the adress of the sample data object
490  // itself here, as the vector might get resized if more columns are added, so the address
491  // is not stable. Hence, we are using the index. There might be other ways to implement that
492  // (using a vector of pointers, or only added the lambdas later once we know how many samples
493  // there are), but that does neither seem faster nor simpler, so we stick with this approach.
494  assert( sample_info.index < std::numeric_limits<size_t>::max() );
495  auto parent = parent_;
496  auto sample_data = sample_data_;
497  auto index = sample_info.index;
498  column_processors_.push_back( [parent, sample_data, index]( genesis::utils::InputStream& it ){
499  assert( index < sample_data->size() );
500  if( parse_if_missing_( parent, it ) ) {
501  sample_data->at(index).is_missing = true;
502  } else {
503  sample_data->at(index).ref_cnt = utils::parse_unsigned_integer<size_t>( it );
504  }
505  });
506 
507  // Indicate that we found a matching header name.
508  return 1;
509 }
510 
511 // -------------------------------------------------------------------------
512 // evaluate_if_field_is_sample_alt_
513 // -------------------------------------------------------------------------
514 
515 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_alt_(
516  std::string const& field,
517  std::unordered_set<std::string>& all_samplenames
518 ) {
519  // Same as above in evaluate_if_field_is_sample_ref_(), but without comments here
520  // to keep it shorter. See there for explanations.
521  // This is quite some code duplication, but we have this intermixed with variable access
522  // of the data that we are writing to, etc, and would be quite the cumbersone template
523  // function to use instead... not quite sure if this is better.
524 
525  assert( parent_ );
526  assert( ! field.empty() );
527  std::string samplename;
528  if( ! match_header_sample_(
529  field, parent_->usr_smp_alt_name_, parent_->alt_names_, parent_->cnt_names_, samplename
530  )) {
531  return 0;
532  }
533  all_samplenames.insert( samplename );
534  if( is_ignored_sample_( samplename )) {
535  auto parent = parent_;
536  column_processors_.push_back( [parent]( genesis::utils::InputStream& it ){
537  if( ! parse_if_missing_( parent, it ) ) {
538  utils::parse_unsigned_integer<size_t>( it );
539  }
540  });
541  return 1;
542  }
543  auto& sample_info = get_sample_info_( samplename );
544  if( sample_info.has_alt ) {
545  throw std::runtime_error(
546  "Cannot unambiguously parse frequency table header, as it contains multiple columns "
547  "for the alternative count of sample \"" + samplename + "\"."
548  );
549  }
550  sample_info.has_alt = true;
551  assert( sample_info.index < std::numeric_limits<size_t>::max() );
552  auto parent = parent_;
553  auto sample_data = sample_data_;
554  auto index = sample_info.index;
555  column_processors_.push_back( [parent, sample_data, index]( genesis::utils::InputStream& it ){
556  assert( index < sample_data->size() );
557  if( parse_if_missing_( parent, it ) ) {
558  sample_data->at(index).is_missing = true;
559  } else {
560  sample_data->at(index).alt_cnt = utils::parse_unsigned_integer<size_t>( it );
561  }
562  });
563  return 1;
564 }
565 
566 // -------------------------------------------------------------------------
567 // evaluate_if_field_is_sample_frq_
568 // -------------------------------------------------------------------------
569 
570 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_frq_(
571  std::string const& field,
572  std::unordered_set<std::string>& all_samplenames
573 ) {
574  // Same as above in evaluate_if_field_is_sample_ref_(), but without comments here
575  // to keep it shorter. See there for explanations.
576 
577  assert( parent_ );
578  assert( ! field.empty() );
579  std::string samplename;
580  if( ! match_header_sample_(
581  field, parent_->usr_smp_frq_name_, parent_->frq_names_, samplename
582  )) {
583  return 0;
584  }
585  all_samplenames.insert( samplename );
586  if( is_ignored_sample_( samplename )) {
587  auto parent = parent_;
588  column_processors_.push_back( [parent]( genesis::utils::InputStream& it ){
589  if( ! parse_if_missing_( parent, it ) ) {
590  utils::parse_float<double>( it );
591  }
592  });
593  return 1;
594  }
595  auto& sample_info = get_sample_info_( samplename );
596  if( sample_info.has_frq ) {
597  throw std::runtime_error(
598  "Cannot unambiguously parse frequency table header, as it contains multiple columns "
599  "for the frequency of sample \"" + samplename + "\"."
600  );
601  }
602  sample_info.has_frq = true;
603  assert( sample_info.index < std::numeric_limits<size_t>::max() );
604  auto parent = parent_;
605  auto sample_data = sample_data_;
606  auto index = sample_info.index;
607  column_processors_.push_back( [parent, sample_data, index]( genesis::utils::InputStream& it ){
608  assert( index < sample_data->size() );
609  if( parse_if_missing_( parent, it ) ) {
610  sample_data->at(index).is_missing = true;
611  } else {
612  sample_data->at(index).frq = utils::parse_float<double>( it );
613  }
614  });
615  return 1;
616 }
617 
618 // -------------------------------------------------------------------------
619 // evaluate_if_field_is_sample_cov_
620 // -------------------------------------------------------------------------
621 
622 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_cov_(
623  std::string const& field,
624  std::unordered_set<std::string>& all_samplenames
625 ) {
626  // Same as above in evaluate_if_field_is_sample_ref_(), but without comments here
627  // to keep it shorter. See there for explanations.
628 
629  assert( parent_ );
630  assert( ! field.empty() );
631  std::string samplename;
632  if( ! match_header_sample_(
633  field, parent_->usr_smp_cov_name_, parent_->cov_names_, samplename
634  )) {
635  return 0;
636  }
637  all_samplenames.insert( samplename );
638  if( is_ignored_sample_( samplename )) {
639  auto parent = parent_;
640  column_processors_.push_back( [parent]( genesis::utils::InputStream& it ){
641  if( ! parse_if_missing_( parent, it ) ) {
642  utils::parse_unsigned_integer<size_t>( it );
643  }
644  });
645  return 1;
646  }
647  auto& sample_info = get_sample_info_( samplename );
648  if( sample_info.has_cov ) {
649  throw std::runtime_error(
650  "Cannot unambiguously parse frequency table header, as it contains multiple columns "
651  "for the read depth of sample \"" + samplename + "\"."
652  );
653  }
654  sample_info.has_cov = true;
655  assert( sample_info.index < std::numeric_limits<size_t>::max() );
656  auto parent = parent_;
657  auto sample_data = sample_data_;
658  auto index = sample_info.index;
659  column_processors_.push_back( [parent, sample_data, index]( genesis::utils::InputStream& it ){
660  assert( index < sample_data->size() );
661  if( parse_if_missing_( parent, it ) ) {
662  sample_data->at(index).is_missing = true;
663  } else {
664  sample_data->at(index).cov = utils::parse_unsigned_integer<size_t>( it );
665  }
666  });
667  return 1;
668 }
669 
670 // -------------------------------------------------------------------------
671 // Sample Helpers
672 // -------------------------------------------------------------------------
673 
674 FrequencyTableInputStream::Iterator::SampleInfo&
675 FrequencyTableInputStream::Iterator::get_sample_info_(
676  std::string const& samplename
677 ) {
678  // Get the sample type object for a given sample name from the header info.
679  // If the name is not yet present, create it and set its index.
680  if( header_info_.sample_infos.count( samplename ) == 0 ) {
681  // We use the next available index and create a new entry in the map.
682  // Has to be two separate lines for this to work, I think.
683  auto const next_index = header_info_.sample_infos.size();
684  header_info_.sample_infos[samplename].index = next_index;
685  }
686 
687  // Now return the element, either an existing one, or the one just created.
688  // We assert that the element is there, and all indices in order in the vec and map.
689  assert( header_info_.sample_infos.count( samplename ) > 0 );
690  assert( header_info_.sample_infos[ samplename ].index < std::numeric_limits<size_t>::max() );
691  return header_info_.sample_infos[ samplename ];
692 }
693 
694 bool FrequencyTableInputStream::Iterator::is_ignored_sample_(
695  std::string const& samplename
696 ) const {
697  // We only use the filtering if it has been set to an actual list of names.
698  assert( parent_ );
699  if( parent_->sample_names_filter_.empty() ) {
700  return false;
701  }
702 
703  // Return whether that sample shall be ignored.
704  auto const found = ( parent_->sample_names_filter_.count( samplename ) > 0 );
705  return !( found ^ parent_->inverse_sample_names_filter_ );
706 }
707 
708 bool FrequencyTableInputStream::Iterator::parse_if_missing_(
709  FrequencyTableInputStream const* parent,
710  genesis::utils::InputStream& input_stream
711 ) {
712  auto const buffer = input_stream.buffer();
713 
714  // Do a case insentive comparison of two char arrays.
715  // The comparison functions are short circuited so that they are only called when
716  // the buffer actually has sufficient data.
717  auto check_missing_and_skip_ = [&input_stream](
718  char const* lhs, size_t lhs_len, char const* rhs, size_t rhs_len
719  ) {
720  if(
721  lhs_len >= rhs_len &&
722  strncasecmp( lhs, rhs, rhs_len ) == 0
723  ) {
724  input_stream.jump_unchecked( rhs_len );
725  return true;
726  }
727  return false;
728  };
729 
730  // If user-provided missing string is given, use that. Otherwise, try all defaults.
731  // If we find any of the missing indicators, we move the input forward beyond it,
732  // and return true. Otherwise, we do nothing to the stream, and return false.
733  if( parent->usr_missing_.empty() ) {
734  for( auto const& missing_word : parent->missing_ ) {
735  auto const is_missing = check_missing_and_skip_(
736  buffer.first, buffer.second,
737  missing_word.c_str(), missing_word.size()
738  );
739  if( is_missing ) {
740  return true;
741  }
742  }
743  } else {
744  auto const is_missing = check_missing_and_skip_(
745  buffer.first, buffer.second,
746  parent->usr_missing_.c_str(), parent->usr_missing_.size()
747  );
748  if( is_missing ) {
749  return true;
750  }
751  }
752  return false;
753 }
754 
755 // -------------------------------------------------------------------------
756 // String Matching Helpers
757 // -------------------------------------------------------------------------
758 
759 bool FrequencyTableInputStream::Iterator::match_header_field_(
760  std::string const& field,
761  std::string const& user_string,
762  std::vector<std::string> const& predefined_list
763 ) const {
764  // If the user string is not empty, we want to match against it.
765  // Only if it is empty, we want to try to match against the predefined list.
766  // Otherwise, if the user string is provided but does not match, we would still compare
767  // against our predifined strings, hence defying the purpose of a user provided string,
768  // as then the user would not have control over what the exact match should be like any more.
769  assert( ! field.empty() );
770  if( ! user_string.empty() ) {
771  return field == user_string;
772  }
773  return utils::contains_ci_alnum( predefined_list, field );
774 }
775 
776 bool FrequencyTableInputStream::Iterator::match_header_sample_(
777  std::string const& field,
778  std::string const& user_substring,
779  std::vector<std::string> const& predefined_list,
780  std::string& samplename
781 ) const {
782  // First try to find an exact match with the prefix or suffix provided by the user.
783  // This needs to be exact in terms of case sensitivity and non-alnum chars.
784  // Same as above, if a user string is provided, we only match against that,
785  // but then don't continue to match against the predifined lists as well.
786  assert( ! field.empty() );
787  if( ! user_substring.empty() ) {
788  return match_header_sample_user_partial_( field, user_substring, samplename );
789  }
790 
791  // If that did not work, we try the predefined lists instead:
792  // Try to find the field as a prefix or a suffix in any of the elements of the list.
793  for( auto const& name : predefined_list ) {
794  if( match_header_sample_predefined_partial_( field, name, samplename )) {
795  return true;
796  }
797  }
798  return false;
799 }
800 
801 bool FrequencyTableInputStream::Iterator::match_header_sample_(
802  std::string const& field,
803  std::string const& user_substring,
804  std::vector<std::string> const& predefined_list1,
805  std::vector<std::string> const& predefined_list2,
806  std::string& samplename
807 ) const {
808  // First try to find an exact match with the prefix or suffix provided by the user.
809  // This needs to be exact in terms of case sensitivity and non-alnum chars.
810  // Same as above, if a user string is provided, we only match against that,
811  // but then don't continue to match against the predifined lists as well.
812  assert( ! field.empty() );
813  if( ! user_substring.empty() ) {
814  return match_header_sample_user_partial_( field, user_substring, samplename );
815  }
816 
817  // If that did not work, we try the predefined lists instead:
818  // Try to find the field as a prefix or a suffix in any of the elements of the combination
819  // of both lists, in any order. This is rather cumbersome, but we only do that for the header
820  // line, so it's okay.
821  // For example, we might get list1 == ref_names_, and list2 == cnt_names_,
822  // and then try to find all their pairwise combinations, such as "refcnt" or "countreference"
823  // as a prefix or a suffix of the field. If any matches, we take the rest of the field that
824  // was not machted as the sample name.
825  for( auto const& name1 : predefined_list1 ) {
826  for( auto const& name2 : predefined_list2 ) {
827  auto name = name1 + name2;
828  if( match_header_sample_predefined_partial_( field, name, samplename )) {
829  return true;
830  }
831  name = name2 + name1;
832  if( match_header_sample_predefined_partial_( field, name, samplename )) {
833  return true;
834  }
835  }
836  }
837  return false;
838 }
839 
840 bool FrequencyTableInputStream::Iterator::match_header_sample_user_partial_(
841  std::string const& field,
842  std::string const& substring,
843  std::string& samplename
844 ) const {
845  // Check for exact prefix or suffix matches, and also require that there needs to be
846  // a remainder to be used as sample name, i.e., that not the whole substring matches.
847  if( utils::starts_with( field, substring, samplename ) && ! samplename.empty() ) {
848  return true;
849  }
850  if( utils::ends_with( field, substring, samplename ) && ! samplename.empty() ) {
851  return true;
852  }
853  return false;
854 }
855 
856 bool FrequencyTableInputStream::Iterator::match_header_sample_predefined_partial_(
857  std::string const& field,
858  std::string const& substring,
859  std::string& samplename
860 ) const {
861  // Check for case insensitive, and only alnum char, prefix or suffix matches, and also require
862  // that there needs to be a remainder to be used as sample name, i.e., that not the whole
863  // substring matches.
864  if( utils::starts_with_ci_alnum( field, substring, samplename, true ) && ! samplename.empty() ) {
865  return true;
866  }
867  if( utils::ends_with_ci_alnum( field, substring, samplename, true ) && ! samplename.empty() ) {
868  return true;
869  }
870  return false;
871 }
872 
873 // =================================================================================================
874 // Increment and Processing Samples
875 // =================================================================================================
876 
877 // -------------------------------------------------------------------------
878 // increment_
879 // -------------------------------------------------------------------------
880 
881 void FrequencyTableInputStream::Iterator::increment_()
882 {
883  using namespace genesis::utils;
884  assert( input_stream_ );
885  assert( parent_ );
886  auto& it = *input_stream_;
887 
888  // The previous iteration reached the end. We only set the iterator to end now, so that the
889  // previous line is actually being processed by the user.
890  if( ! it ) {
891  parent_ = nullptr;
892  return;
893  }
894 
895  // We need to reset the internal sample data, so that any remnants of a previous iteration
896  // are removed. In particular, we need to reset the is_missing information here.
897  assert( sample_data_ );
898  for( auto& data : *sample_data_ ) {
899  data = SampleData();
900  }
901 
902  // Process all columns, using the processor lambda functions one after another
903  // in the order that we expect the columns to be in.
904  size_t processor_index = 0;
905  while( it && *it != '\n' ) {
906  if( processor_index >= column_processors_.size() ) {
907  throw std::runtime_error(
908  "Error while processing frequency table: More columns in line " +
909  std::to_string( it.line() ) + " than in the file header."
910  );
911  }
912 
913  // Process the column. That reads stuff into the Variant or into the SampleData,
914  // and leaves the stream at the next char after reading, i.e., the separator or new line.
915  column_processors_[processor_index]( it );
916 
917  // Check that this is actually the case, and we are left where we expect to be.
918  if( it && ( *it != '\n' && *it != parent_->separator_char_ )) {
919  throw std::runtime_error(
920  "Error while processing frequency table: Unexpected char " +
921  utils::char_to_hex( *it ) + " at " + it.at()
922  );
923  }
924 
925  // We are at the end of the field or line. Go to the next field, if there is one.
926  assert( !it || ( *it == '\n' || *it == parent_->separator_char_ ));
927  if( it && *it == parent_->separator_char_ ) {
928  ++it;
929  }
930 
931  // We are done with this column. The next one needs to be processed differently.
932  ++processor_index;
933  }
934  assert( !it || *it == '\n');
935  ++it;
936 
937  // Fewer columns than were given in the header.
938  // If it's not the correct size, it's wrong. We check for too many columns above already,
939  // so here, if its off, it's too few; we still check both, just to be sure.
940  if( processor_index != column_processors_.size() ) {
941  assert( processor_index < column_processors_.size() );
942  throw std::runtime_error(
943  "Error while processing frequency table: Fewer columns in line " +
944  std::to_string( it.line() - 1 ) + " than in the file header."
945  );
946  }
947 
948  // Process the ref and alt bases, with and without a given ref genome.
949  if( parent_->ref_genome_ ) {
950  // Get the current ref genome base.
951  assert( current_variant_->chromosome.size() > 0 );
952  assert( current_variant_->position > 0 );
953  auto const ref_gen_base = parent_->ref_genome_->get_base(
954  current_variant_->chromosome, current_variant_->position
955  );
956 
957  // Both ref genome and ref column are given and have a usable value. Try to match them.
958  if( header_info_.has_ref && utils::to_upper( current_variant_->reference_base ) != 'N' ) {
959  // Get a shorthand, and check the bases that the processor allows.
960  auto const ref_base = utils::to_upper( current_variant_->reference_base );
961  assert( is_valid_base( ref_base ));
962 
963  // Both are given and the base from the file is not 'N', so let's see if they agree.
964  // If not, that indicates some issue, so better be careful.
965  // We allow the ref genome to use ambiguity bases though.
966  if( ! sequence::nucleic_acid_code_containment( ref_gen_base, ref_base )) {
967  throw std::runtime_error(
968  "At chromosome \"" + current_variant_->chromosome + "\" position " +
969  std::to_string( current_variant_->position ) +
970  ", the provided reference genome has base '" +
971  std::string( 1, ref_gen_base ) +
972  "', while the reference base column in the frequency file is '" +
973  std::string( 1, ref_base ) +
974  "', which is not contained in the referenge genome, " +
975  "and hence likely indicates an issue with the data"
976  );
977  }
978  } else {
979  assert( header_info_.has_ref || current_variant_->reference_base == 'N' );
980 
981  // Here, we have the case where either there is no ref base from the input file,
982  // or it's 'N', so that we might want to replace it by the ref genome. Both cases are
983  // treated the same here: Check that we can use the genome base, or use N if not.
984  if( is_valid_base( ref_gen_base )) {
985  current_variant_->reference_base = utils::to_upper( ref_gen_base );
986  } else {
987  current_variant_->reference_base = 'N';
988  }
989  }
990  } else {
991  // If we do not have columns for ref and/or alt base, and no reference genome,
992  // they should have been left at 'N'.
993  // We use short-circuit or here: If it has ref/alt, the second part is not evaluated.
994  assert( header_info_.has_ref || current_variant_->reference_base == 'N' );
995  assert( header_info_.has_alt || current_variant_->alternative_base == 'N' );
996  }
997 
998  // Make sure all sizes of the involved data are in sync.
999  assert( sample_data_ );
1000  assert( current_variant_ );
1001  assert( header_info_.sample_infos.size() == sample_data_->size() );
1002  assert( header_info_.sample_infos.size() == current_variant_->samples.size() );
1003 
1004  // Now turn all intermediate data into sample counts.
1005  // We go in random order here, following the content of header_info_.sample_infos
1006  // Not sure if that matters much speed-wise, but it's the easiest to implement for now.
1007  for( auto const& sample_info : header_info_.sample_infos ) {
1008  auto const index = sample_info.second.index;
1009  assert( index < sample_data_->size() );
1010  assert( index < current_variant_->samples.size() );
1011  process_sample_data_( sample_info.second, sample_data_->at(index), *current_variant_, index );
1012  }
1013 
1014  // Set the status of the Variant. If all samples are missing, so is this Variant.
1015  current_variant_->status.reset();
1016  size_t missing_count = 0;
1017  for( auto const& sample : current_variant_->samples ) {
1018  if( sample.status.is( SampleCountsFilterTag::kMissing )) {
1019  ++missing_count;
1020  }
1021  }
1022  if( missing_count == current_variant_->samples.size() ) {
1023  current_variant_->status.set( VariantFilterTag::kMissing );
1024  }
1025 }
1026 
1027 // -------------------------------------------------------------------------
1028 // process_sample_data_
1029 // -------------------------------------------------------------------------
1030 
1031 void FrequencyTableInputStream::Iterator::process_sample_data_(
1032  FrequencyTableInputStream::Iterator::SampleInfo const& sample_info,
1033  FrequencyTableInputStream::Iterator::SampleData const& sample_data,
1034  Variant& variant,
1035  size_t sample_index
1036 ) {
1037  // Store the counts that we get first here,
1038  // and then use one routine to assign them to the sample later.
1039  SampleCounts::size_type ref_cnt = 0;
1040  SampleCounts::size_type alt_cnt = 0;
1041  bool do_frq_check = false;
1042 
1043  // Reset the sample, and skip everything else if this is missing data.
1044  variant.samples[sample_index] = SampleCounts();
1045  if( sample_data.is_missing ) {
1046  variant.samples[sample_index].status.set( SampleCountsFilterTag::kMissing );
1047  return;
1048  }
1049 
1050  // Check which of all combinations of input column types that we offer is given for this sample,
1051  // and process it accordingly.
1052  if( sample_info.has_ref && sample_info.has_alt ) {
1053 
1054  // Simple case, just use the counts.
1055  ref_cnt = sample_data.ref_cnt;
1056  alt_cnt = sample_data.alt_cnt;
1057  do_frq_check = true;
1058 
1059  // Check that the read depth fits.
1060  // We are dealing with integers here, so this needs to be exact.
1061  if( sample_info.has_cov && sample_data.cov != sample_data.ref_cnt + sample_data.alt_cnt ) {
1062  throw std::runtime_error(
1063  "Invalid read depth that is not the sum of the reference and alternative base counts."
1064  );
1065  }
1066 
1067  } else if( sample_info.has_ref && sample_info.has_cov ) {
1068 
1069  // Already have checked that case above.
1070  assert( ! sample_info.has_alt );
1071 
1072  // Check that the values are valid.
1073  if( sample_data.cov < sample_data.ref_cnt ) {
1074  throw std::runtime_error(
1075  "Invalid read depth that is smaller than the reference base count."
1076  );
1077  }
1078 
1079  // Set them to the respective counters.
1080  ref_cnt = sample_data.ref_cnt;
1081  alt_cnt = sample_data.cov - sample_data.ref_cnt;
1082  do_frq_check = true;
1083 
1084  } else if( sample_info.has_alt && sample_info.has_cov ) {
1085 
1086  // Same as above, but the other way round.
1087  // Highly unlikely case, but hey, let's implement it.
1088  assert( ! sample_info.has_ref );
1089  throw std::runtime_error(
1090  "Invalid read depth that is smaller than the alternative base count."
1091  );
1092  ref_cnt = sample_data.cov - sample_data.alt_cnt;
1093  alt_cnt = sample_data.ref_cnt;
1094  do_frq_check = true;
1095 
1096  } else if( sample_info.has_frq ) {
1097 
1098  // We only have at max one count variable.
1099  assert(
1100  static_cast<int>( sample_info.has_ref ) +
1101  static_cast<int>( sample_info.has_alt ) +
1102  static_cast<int>( sample_info.has_cov )
1103  <= 1
1104  );
1105 
1106  // Get the frequency, and check if it is within tolerance,
1107  // and and process it to be within the unit interval.
1108  auto frq = sample_data.frq;
1109  if( frq < 0.0 ) {
1110  if( ! utils::almost_equal_relative( frq, 0.0, parent_->allowed_rel_freq_error_ )) {
1111  throw std::runtime_error( "Invalid frequency < 0.0 in frequency table." );
1112  }
1113  frq = 0.0;
1114  }
1115  if( frq > 1.0 ) {
1116  if( ! utils::almost_equal_relative( frq, 1.0, parent_->allowed_rel_freq_error_ )) {
1117  throw std::runtime_error( "Invalid frequency > 1.0 in frequency table." );
1118  }
1119  frq = 1.0;
1120  }
1121  assert(( !std::isfinite( frq )) ^ ( 0.0 <= frq && frq <= 1.0 ));
1122 
1123  // Go through different ways of computing counts from the frequency.
1124  // We can only ever have one of the counts be set, as otherwise, one of the previous
1125  // ways of computing the counts (see above) would already have kicked in, and we
1126  // would not have ended up here.
1127  if( ! std::isfinite( frq )) {
1128  // Non-finite frequencies are invalid data. Nothing else to do.
1129  variant.samples[sample_index].status.set( SampleCountsFilterTag::kInvalid );
1130  return;
1131  } else if( sample_info.has_cov ) {
1132  // Avoid rounding errors by doing the second number directly on integers.
1133  ref_cnt = static_cast<SampleCounts::size_type>( sample_data.cov * frq );
1134  alt_cnt = sample_data.cov - ref_cnt;
1135  } else if( sample_info.has_ref ) {
1136  // We have a frequency f, and a ref count r.
1137  // Resolve f = r/(r+a) accordingly to get the alt count a.
1138  // Here, we need to introduce some rounding. No way around this.
1139  // Let's hope for the best - in particular, that all numbers behave nicely.
1140  ref_cnt = sample_data.ref_cnt;
1141  auto const ref_dbl = static_cast<double>( ref_cnt );
1142  alt_cnt = static_cast<SampleCounts::size_type>(( ref_dbl / frq ) - ref_dbl );
1143  } else if( sample_info.has_alt ) {
1144  // Same idea as above, but the other way round.
1145  alt_cnt = sample_data.alt_cnt;
1146  auto const alt_dbl = static_cast<double>( alt_cnt );
1147  ref_cnt = static_cast<SampleCounts::size_type>( alt_dbl / (( 1.0 / frq ) - 1.0 ));
1148  } else {
1149  // If no count is given at all, we use a different strategy instead.
1150  // Multiply by our large number, to get an int that can be prepresented in double.
1151  auto const int_fac = parent_->int_factor_;
1152  ref_cnt = static_cast<SampleCounts::size_type>( int_fac * frq );
1153  alt_cnt = static_cast<SampleCounts::size_type>( int_fac ) - ref_cnt;
1154 
1155  // Also, make sure that the large number actually fits into the size type.
1156  // We check that when setting the value, but let's be safe.
1157  assert(
1158  static_cast<double>( static_cast<SampleCounts::size_type>( int_fac )) == int_fac
1159  );
1160  }
1161 
1162  // Lastly, assign to where the counts belong.
1163  // If the freq does not correspond to the ref but to the alt, we need to flip them.
1164  if( ! parent_->frequency_is_ref_ ) {
1165  std::swap( ref_cnt, alt_cnt );
1166  }
1167 
1168  } else {
1169  // This case should not happen, as we would have thrown an exception when parsing
1170  // the header already, as we check there that we have at least one of the above cases.
1171  throw std::domain_error( "Internal error: No valid data type to parse frequency table." );
1172  }
1173 
1174  // Now that we have processed the data, we can use the final counts to check the frequency.
1175  if( do_frq_check && sample_info.has_frq ) {
1176  auto const ref = static_cast<double>( ref_cnt );
1177  auto const alt = static_cast<double>( alt_cnt );
1178  auto const frq = ( parent_->frequency_is_ref_ ? ref : alt ) / ( ref + alt );
1179  if( ! utils::almost_equal_relative( frq, sample_data.frq, parent_->allowed_rel_freq_error_ )) {
1180  throw std::runtime_error(
1181  "Mismatching frequency value ~" + std::to_string( sample_data.frq ) +
1182  " that has a difference greater than the allowed relative error (" +
1183  std::to_string( parent_->allowed_rel_freq_error_ ) + ") to the frequency " +
1184  std::to_string( frq ) + " determined by the reference count " +
1185  std::to_string( ref_cnt ) + " and alternative count " + std::to_string( alt_cnt )
1186  );
1187  }
1188  }
1189 
1190  // Now store the counts in the sample, using the ref/alt base info if available,
1191  // or fixed bases if ref and/or alt are not available.
1192  char ref_base = utils::to_upper( variant.reference_base );
1193  char alt_base = utils::to_upper( variant.alternative_base );
1194  assert( is_valid_base_or_n( ref_base ));
1195  assert( is_valid_base_or_n( alt_base ));
1196  if( utils::char_match_ci( ref_base, 'N' )) {
1197  // Neither base is given. We do not change the base assignment of the variant,
1198  // but we need positions to use for setting the values. Arbitrarily choose A and T.
1199  ref_base = 'A';
1200  alt_base = 'T';
1201  } else if( utils::char_match_ci( alt_base, 'N' )) {
1202  // Only ref base is given. Use its transition base as the most likely alternative.
1203  assert( is_valid_base( ref_base ));
1204  alt_base = ::genesis::sequence::nucleic_acid_transition( ref_base );
1205  }
1206  assert( sample_index < variant.samples.size() );
1207  assert( ref_base != 'N' && ref_base != 'n' );
1208  assert( alt_base != 'N' && alt_base != 'n' );
1209  if( ref_base == alt_base ) {
1210  throw std::runtime_error(
1211  "At chromosome \"" + variant.chromosome + "\" position " +
1212  std::to_string( variant.position ) +
1213  ": Invalid reference and alternative base that are both '" +
1214  std::string( 1, ref_base ) + "' in frequency table."
1215  );
1216  }
1217 
1218  // Set the base counts
1219  set_base_count( variant.samples[sample_index], ref_base, ref_cnt );
1220  set_base_count( variant.samples[sample_index], alt_base, alt_cnt );
1221 }
1222 
1223 } // namespace population
1224 } // namespace genesis
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:437
genesis::placement::swap
void swap(Sample &lhs, Sample &rhs)
Definition: sample.cpp:104
genesis::utils::is_graph
constexpr bool is_graph(char c) noexcept
Return whether a char is a character with graphical representation, according to isgraph of the cctyp...
Definition: char.hpp:166
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
parser.hpp
functions.hpp
genesis::sequence::nucleic_acid_code_containment
bool nucleic_acid_code_containment(char a, char b, bool undetermined_matches_all)
Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters i...
Definition: codes.cpp:596
genesis::utils::almost_equal_relative
bool almost_equal_relative(double lhs, double rhs, double max_rel_diff=std::numeric_limits< double >::epsilon())
Check whether two doubles are almost equal, using a relative epsilon to compare them.
Definition: common.hpp:157
genesis::utils::InputStream::buffer
std::pair< char const *, size_t > buffer()
Direct access to the internal buffer.
Definition: input_stream.hpp:390
common.hpp
genesis::utils::char_match_ci
constexpr bool char_match_ci(char c1, char c2) noexcept
Return whether two chars are the same, case insensitive, and ASCII-only.
Definition: char.hpp:243
genesis::population::set_base_count
void set_base_count(SampleCounts &sample, char base, SampleCounts::size_type value)
Set the count for a base given as a char.
Definition: population/function/functions.cpp:86
genesis::population::FrequencyTableInputStream::FrequencyTableInputStream
FrequencyTableInputStream()=default
Create a default instance, with no input.
frequency_table_input_stream.hpp
genesis::utils::starts_with
bool starts_with(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix.
Definition: string.cpp:136
genesis::utils::ends_with
bool ends_with(std::string const &text, std::string const &suffix)
Return whether a string ends with another string, i.e., check for a suffix.
Definition: string.cpp:230
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Split a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
sample_counts_filter.hpp
genesis::utils::contains_ci_alnum
bool contains_ci_alnum(std::vector< std::string > const &haystack, std::string const &needle)
Return whether a vector of strings contains a given string, case insensitive, and ignoring all non-al...
Definition: string.cpp:71
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
string.hpp
Provides some commonly used string utility functions.
genesis::utils::strncasecmp
int strncasecmp(char const *s1, char const *s2, size_t n)
Compares up to n chars of two strings, ignoring case differences.
Definition: string.cpp:90
LOG_WARN
#define LOG_WARN
Log a warning. See genesis::utils::LoggingLevel.
Definition: logging.hpp:97
genesis::population::is_valid_base_or_n
constexpr bool is_valid_base_or_n(char c)
Return whether a given base is in ACGTN, case insensitive.
Definition: population/function/functions.hpp:71
logging.hpp
Provides easy and fast logging functionality.
genesis::utils::read_until
std::string read_until(InputStream &source, char criterion)
Lexing function that reads from the stream until its current char equals the provided one....
Definition: scanner.hpp:254
genesis::population::is_valid_base
constexpr bool is_valid_base(char c)
Return whether a given base is in ACGT, case insensitive.
Definition: population/function/functions.hpp:56
genesis::sequence::nucleic_acid_transition
char nucleic_acid_transition(char code)
Return the transition base for the given base.
Definition: codes.cpp:572
genesis::population::SampleCountsFilterTag::kInvalid
@ kInvalid
Generic indicator that the sample is invalid.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::starts_with_ci_alnum
bool starts_with_ci_alnum(std::string const &text, std::string const &prefix)
Return whether a string starts with another string (prefix), comparing case-independent,...
Definition: string.cpp:170
convert.hpp
variant_filter.hpp
genesis::utils::ends_with_ci_alnum
bool ends_with_ci_alnum(std::string const &text, std::string const &suffix)
Return whether a string ends with another string (suffix), comparing case-independent,...
Definition: string.cpp:264
char.hpp
genesis::population::VariantFilterTag::kMissing
@ kMissing
Position is missing in the input data.
genesis::utils::skip_until
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one.
Definition: scanner.hpp:184
genesis::utils::InputStream::line
size_t line() const
Return the current line of the input stream.
Definition: input_stream.hpp:417
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
scanner.hpp
genesis::population::FrequencyTableInputStream::Iterator::sample_names
std::vector< std::string > sample_names() const
Return the sample names found in the header, in the order in which they are in the Variant of each it...
Definition: frequency_table_input_stream.cpp:62
genesis::population::SampleCountsFilterTag::kMissing
@ kMissing
Position is missing in the input data.
genesis::utils::InputStream::jump_unchecked
void jump_unchecked(size_t n)
Jump forward in the stream by a certain amount of chars.
Definition: input_stream.cpp:609
genesis::population::SampleCounts::size_type
size_t size_type
Public alias for the size type that the class uses to store its counts.
Definition: sample_counts.hpp:61
codes.hpp