A library for working with phylogenetic data.
v0.25.0
vcf_format_iterator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_VCF_FORMAT_ITERATOR_H_
2 #define GENESIS_POPULATION_FORMATS_VCF_FORMAT_ITERATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2021 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
38 
39 #include <cassert>
40 #include <cstdint>
41 #include <memory>
42 #include <stdexcept>
43 #include <string>
44 #include <vector>
45 
46 extern "C" {
47  // #include <htslib/hts.h>
48  // #include <htslib/vcf.h>
49 
50  struct bcf_hdr_t;
51  struct bcf1_t;
52 }
53 
54 namespace genesis {
55 namespace population {
56 
57 // =================================================================================================
58 // Forward Declarations
59 // =================================================================================================
60 
61 template< typename S, typename T >
63 
64 // Declare the specializations of the iterator that are used by VcfRecord, for ease of use.
69 
70 // =================================================================================================
71 // VCF/BCF Format Helper
72 // =================================================================================================
73 
86 {
87 private:
88 
89  // Need constructor only to have static asserts have access to class members
91 
92  template< typename S, typename T >
93  friend class VcfFormatIterator;
94 
95  // Need these in VcfFormatIterator to not include the htslib headers... not sure if this is
96  // a good idea after all... maybe we should have just rolled with it and included them...
97  static const int8_t bcf_int8_vector_end_ = (-127); /* INT8_MIN + 1 */
98  static const int16_t bcf_int16_vector_end_ = (-32767); /* INT16_MIN + 1 */
99  static const int32_t bcf_int32_vector_end_ = (-2147483647); /* INT32_MIN + 1 */
100  static const int64_t bcf_int64_vector_end_ = (-9223372036854775807LL); /* INT64_MIN + 1 */
101  static const char bcf_str_vector_end_ = 0;
102  static const int8_t bcf_int8_missing_ = (-128); /* INT8_MIN */
103  static const int16_t bcf_int16_missing_ = (-32767-1); /* INT16_MIN */
104  static const int32_t bcf_int32_missing_ = (-2147483647-1); /* INT32_MIN */
105  static const int64_t bcf_int64_missing_ = (-9223372036854775807LL - 1LL); /* INT64_MIN */
106  static const char bcf_str_missing_ = 0x07;
107 
113  static int32_t bcf_hdr_nsamples_( ::bcf_hdr_t const* header );
114 
120  static std::string hdr_sample_name_( ::bcf_hdr_t const* header, size_t index );
121 
126  static int bcf_get_format_string_(
127  const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst
128  );
129 
134  static int bcf_get_format_values_(
135  const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type
136  );
137 
142  static int bcf_get_genotypes_( const bcf_hdr_t *hdr, bcf1_t *line, void **dst, int *ndst );
143 
148  static int bcf_float_is_vector_end_(float f);
149 
154  static int bcf_float_is_missing_(float f);
155 
156 };
157 
158 // =================================================================================================
159 // VCF/BCF Format/Sample Iterator
160 // =================================================================================================
161 
276 template< typename S, typename T >
277 class VcfFormatIterator
278 {
279 public:
280 
281  // -------------------------------------------------------------------------
282  // Member Types
283  // -------------------------------------------------------------------------
284 
286 
287  // -------------------------------------------------------------------------
288  // Constructors and Rule of Five
289  // -------------------------------------------------------------------------
290 
297  VcfFormatIterator() = default;
298 
307  VcfFormatIterator( ::bcf_hdr_t* header, ::bcf1_t* record, std::string const& id, VcfValueType ht_type )
308  : is_end_(false)
309  , header_(header)
310  , record_(record)
311  {
312  // TODO Remove all htslib function calls from this headers and put them in a cpp file instead.
313 
314  // First, get all values for the given record line and the FORMAT id tag, store them in
315  // value_buffer_, store the result of the htslib function call (which gives the total number
316  // of values that are written to value_buffer_) in values_total_, and also store the number of
317  // reserved values in values_reserved_ (we don't really need that last number as of now,
318  // but better keep it). All this needs some specialization for the different data types
319  // that we can handle here (char*, int, float), so we use a specialized function tenokate
320  // for this that is instanciated for each data type (see end of this file).
321  auto const res = construct_values_( header_, record_, id, ht_type );
322 
323  // Now, check that the result is valid, that is, res >= 0 (which has been returned
324  // by the above function call). If not, check_value_return_code_() throws an exception for us.
325  // Once that check finishes without throwing, we assert that we have proper data.
326  // In that case, we have res >= 0 indicating the number of total values, so these
327  // are the same then.
328  VcfHeader::check_value_return_code_(
329  // In order to not include the htslib headers here, we have to use a fixed value
330  // that we define ourselves, instead of their definition BCF_HL_FMT = 2.
331  // We statically assert that they are identical in vcf_common.cpp
332  header_, id, static_cast<int>( ht_type ), static_cast<int>( VcfHeaderLine::kFormat ), res
333  );
334  assert( value_buffer_ );
335  assert( res >= 0 );
336  assert( values_total_ >= 0 );
337  assert( ht_type == VcfValueType::kString || res == values_total_ );
338  assert( values_total_ <= values_reserved_ );
339 
340  // Now, as there can be multiple values per sample, we need to get the number of values
341  // per sample. htslib does not have any helpful comments on that, but their examples
342  // suggest that just division by the total number of samples works, meaning that each sample
343  // has the same number of values set in the array that we get, and that missing values are
344  // padded properly. So, let's do this, and assert that this is evenly divisible, to make
345  // sure that this is what we actually want here.
346  // Addendum after bug hunt: Apparently, this is different for char*/string data, where
347  // the number returned from the htslib call to get the valies bcf_get_format_string()
348  // corresponds to something like the longest string (plus some extra), instead of the number
349  // of values. That means that strings can never have more than one value, and that we need
350  // to set their values_total_ differently (which we do in construct_values_)...
351  // We use our wrapper VcfFormatHelper::bcf_hdr_nsamples_ here, which wraps bcf_hdr_nsamples()
352  // so that we do not have to include htslib here.
353  num_samples_ = VcfFormatHelper::bcf_hdr_nsamples_(header);
354  assert( static_cast<size_t>( values_total_ ) % num_samples_ == 0 );
355  values_per_sample_ = static_cast<size_t>( values_total_ ) / num_samples_;
356  assert( values_per_sample_ * num_samples_ == static_cast<size_t>( values_total_ ));
357 
358  // Now, go to the first value of the first sample.
359  assert( value_idx_ == -1 );
360  go_to_next_value_();
361  assert( value_idx_ >= 0 );
362  }
363 
364  ~VcfFormatIterator() = default;
365 
366  VcfFormatIterator( VcfFormatIterator const& ) = default;
367  VcfFormatIterator( VcfFormatIterator&& ) = default;
368 
369  VcfFormatIterator& operator= ( VcfFormatIterator const& ) = default;
371 
372  // -------------------------------------------------------------------------
373  // General Data Access
374  // -------------------------------------------------------------------------
375 
379  ::bcf_hdr_t* header_data()
380  {
381  return header_;
382  }
383 
387  ::bcf1_t* record_data()
388  {
389  return record_;
390  }
391 
395  size_t sample_count() const
396  {
397  return num_samples_;
398  }
399 
407  size_t values_per_sample() const
408  {
409  return values_per_sample_;
410  }
411 
412  // -------------------------------------------------------------------------
413  // Iterator for Samples
414  // -------------------------------------------------------------------------
415 
421  {
422  return *this;
423  }
424 
428  bool operator == ( self_type const& other ) const
429  {
430  // Two iterators are equal if either both of them are the past-the-end iterator (in which
431  // case we do not need to check any of the values, as they are irrelevant then),
432  // or both are not past-the-end, but all their (important) values match.
433  // We do not check the value buffer here, as this will differ for different instances.
434  return ( is_end_ && other.is_end_ ) || (
435  !is_end_ && !other.is_end_ &&
436  header_ == other.header_ && record_ == other.record_ &&
437  values_total_ == other.values_total_ && num_samples_ == other.num_samples_ &&
438  values_per_sample_ == other.values_per_sample_ &&
439  sample_idx_ == other.sample_idx_ && value_idx_ == other.value_idx_
440  );
441  }
442 
446  bool operator != ( self_type const& other ) const
447  {
448  return !( *this == other );
449  }
450 
458  {
459  ++sample_idx_;
460  if( sample_idx_ < num_samples_ ) {
461  // If we are still within the sample boundaries, go to the first valid entry of the
462  // new sample that we just moved to.
463  value_idx_ = -1;
464  go_to_next_value_();
465  assert( value_idx_ >= 0 );
466  } else {
467  // We have reached the end of the samples.
468  is_end_ = true;
469  }
470  return *this;
471  }
472 
481  {
482  auto tmp = *this;
483  ++(*this);
484  return tmp;
485  }
486 
487  // -------------------------------------------------------------------------
488  // Current element data access within the sample
489  // -------------------------------------------------------------------------
490 
496  size_t sample_index() const
497  {
498  return sample_idx_;
499  }
500 
506  size_t value_index() const
507  {
508  assert( value_idx_ >= 0 );
509  return value_idx_;
510  }
511 
516  std::string sample_name() const
517  {
518  // We use our wrapper VcfFormatHelper::bcf_hdr_nsamples_ here, which wraps bcf_hdr_nsamples()
519  // so that we do not have to include htslib here.
520  assert( static_cast<int32_t>( num_samples_ ) == VcfFormatHelper::bcf_hdr_nsamples_( header_ ));
521  assert( sample_idx_ < num_samples_ );
522 
523  // We use our wrapper VcfFormatHelper::hdr_sample_name_ here, which wraps
524  // header->samples[index]() so that we do not have to include htslib here.
525  return VcfFormatHelper::hdr_sample_name_( header_, sample_idx_ );
526  // return header_->samples[sample_idx_];
527  }
528 
539  bool has_value() const
540  {
541  // The current sample has a valid value if we are still within its bounds and
542  // if the current value is valid (not end of the vector and not missing).
543  // For both, we use the non-throwing versions of the test functions.
544  assert( value_idx_ >= 0 );
545  return
546  test_index_boundaries_( sample_idx_, value_idx_, false ) &&
547  test_valid_value_( sample_idx_, value_idx_, false )
548  ;
549  }
550 
557  T get_value() const
558  {
559  // Get the value at the current position, and do some checks. We assume here
560  // that this function is only ever called when has_next_value() is true - everything else
561  // is a user error, but we want to avoid the overhead of runtime checks in release,
562  // so, assertions it is.
563  assert( value_idx_ >= 0 );
564  assert( test_index_boundaries_( sample_idx_, value_idx_, false ));
565  assert( test_valid_value_( sample_idx_, value_idx_, false ));
566  assert( has_value() );
567 
568  // If all is good, instanciate a target type with the source type value, and return.
569  // For all our cases (string, int, float), this works without special conversions.
570  return T{ *get_value_ptr_( sample_idx_, value_idx_ ) };
571  }
572 
579  void next_value()
580  {
581  // Move to the next value, if there is one.
582  go_to_next_value_();
583  }
584 
591  size_t valid_value_count() const
592  {
593  return valid_value_count_at( sample_idx_ );
594  }
595 
596  // -------------------------------------------------------------------------
597  // Arbitrary element data access within the sample or at arbitrary sample
598  // -------------------------------------------------------------------------
599 
603  std::string sample_name_at( size_t sample_index ) const
604  {
605  test_index_boundaries_( sample_index, 0, true );
606  // We use our wrapper VcfFormatHelper::bcf_hdr_nsamples_ here, which wraps bcf_hdr_nsamples()
607  // so that we do not have to include htslib here.
608  assert( static_cast<int32_t>( num_samples_ ) == VcfFormatHelper::bcf_hdr_nsamples_( header_ ));
609  assert( sample_index < num_samples_ );
610 
611  // We use our wrapper VcfFormatHelper::hdr_sample_name_ here,
612  // which wraps header->samples[index]() so that we do not have to include htslib here.
613  return VcfFormatHelper::hdr_sample_name_( header_, sample_index );
614  // return header_->samples[sample_index];
615  }
616 
623  bool has_value_at( size_t value_index ) const
624  {
625  return has_value_at( sample_idx_, value_index );
626  }
627 
634  bool has_value_at( size_t sample_index, size_t value_index ) const
635  {
636  // Basic index boundary checks. Throws if indices are not within bounds of the arrays.
637  test_index_boundaries_( sample_index, value_index, true );
638 
639  // Get access to the value, and check that it is an actual value.
640  S* ptr = get_value_ptr_( sample_index, value_index );
641  return (! is_vector_end_( *ptr )) && (! is_missing_value_( *ptr ));
642  }
643 
647  T get_value_at( size_t value_index ) const
648  {
649  return get_value_at( sample_idx_, value_index );
650  }
651 
655  T get_value_at( size_t sample_index, size_t value_index ) const
656  {
657  // Basic index boundary checks. Throws if indices are not within bounds of the arrays.
658  test_index_boundaries_( sample_index, value_index, true );
659 
660  // Get access to the value, and check that it is an actual value.
661  // If not, the test function throws an exception.
662  test_valid_value_( sample_index, value_index, true );
663  S* ptr = get_value_ptr_( sample_index, value_index );
664 
665  // If all is good, instanciate a target type with the source type value, and return.
666  // For all our cases (string, int, float), this works without special conversions.
667  return T{ *ptr };
668  }
669 
682  std::vector<T> get_values( bool include_missing = false ) const
683  {
684  return get_values_at( sample_idx_, include_missing );
685  }
686 
692  std::vector<T> get_values_at( size_t sample_index, bool include_missing = false ) const
693  {
694  test_index_boundaries_( sample_index, 0, true );
695  // We use our wrapper VcfFormatHelper::bcf_hdr_nsamples_ here, which
696  // wraps bcf_hdr_nsamples() so that we do not have to include htslib here.
697  assert( static_cast<int32_t>( num_samples_ ) == VcfFormatHelper::bcf_hdr_nsamples_( header_ ));
698  assert( sample_index < num_samples_ );
699 
700  std::vector<T> result;
701  if( include_missing ) {
702  result.reserve( values_per_sample_ );
703  for( size_t i = 0; i < values_per_sample_; ++i ) {
704  S* ptr = get_value_ptr_( sample_index, i );
705  result.emplace_back( *ptr );
706  }
707  } else {
708  auto const cnt = valid_value_count_at( sample_index );
709  result.reserve( cnt );
710  for( size_t i = 0; i < values_per_sample_; ++i ) {
711  if( test_valid_value_( sample_index, i, false )) {
712  S* ptr = get_value_ptr_( sample_index, i );
713  result.emplace_back( *ptr );
714  }
715  }
716  }
717  return result;
718  }
719 
726  size_t valid_value_count_at( size_t sample_index ) const
727  {
728  size_t cnt = 0;
729  for( size_t i = 0; i < values_per_sample_; ++i ) {
730  if( test_valid_value_( sample_index, i, false )) {
731  ++cnt;
732  }
733  }
734  return cnt;
735  }
736 
737  // -------------------------------------------------------------------------
738  // Internal Members
739  // -------------------------------------------------------------------------
740 
741 private:
742 
743  S* get_value_ptr_( size_t sample_index, size_t value_index ) const
744  {
745  // Only called internally, so we can simply assert that the indices are within bounds.
746  assert( value_buffer_ );
747  assert( values_total_ >= 0 );
748  assert( sample_index < num_samples_ );
749  assert( value_index < values_per_sample_ );
750  assert( sample_index * values_per_sample_ + value_index < static_cast<size_t>( values_total_ ));
751 
752  // We want to get an element from the value buffer that corresponds ot a given sample
753  // (multiply by values per sample to get the correct part of the array), and a given
754  // value index within that sample (add at the end to get to value within the array part).
755  return value_buffer_.get() + sample_index * values_per_sample_ + value_index;
756  }
757 
758  void go_to_next_value_()
759  {
760  // We potentially start at -1 for the start of a new sample, but then move to a proper index.
761  ++value_idx_;
762  assert( value_idx_ >= 0 );
763 
764  // Now find a valid value within the bounds of the data for this sample.
765  while( static_cast<size_t>( value_idx_ ) < values_per_sample_ ) {
766  S* current = get_value_ptr_( sample_idx_, value_idx_ );
767 
768  // If we reach the vector end marker, set our value index to the proper past-the-end index
769  // for the current sample, and get out of here. No valid values in this sample left.
770  if( is_vector_end_( *current )) {
771  value_idx_ = values_per_sample_;
772  break;
773  }
774 
775  // If the value is missing, move to the next instead.
776  if( is_missing_value_( *current )) {
777  ++value_idx_;
778  continue;
779  }
780 
781  // If neither, we found a valid value.
782  break;
783  }
784  assert( value_idx_ >= 0 );
785 
786  // We either end up at a valid value (within bounds, not end-of-vector, not missing),
787  // or we reached the end of the values (either through the iterations by incrementing
788  // due to missing values, or because of the hard set in the vecor end condition above).
789  // We need to test for whether we reached the end first and then rely on
790  // short-circuit-evaluation, so that the valid value test is only conducted
791  // if we are at a valid index!
792  assert(
793  ( static_cast<size_t>( value_idx_ ) >= values_per_sample_ ) ||
794  (
795  test_index_boundaries_( sample_idx_, value_idx_, false ) &&
796  test_valid_value_( sample_idx_, value_idx_, false )
797  )
798  );
799  }
800 
801  bool test_index_boundaries_( size_t sample_index, size_t value_index, bool throwing ) const
802  {
803  // Assert that the values per sample is a non-negative number, so that the omission
804  // of the second param of this function for cases where we just want to check the sample
805  // index still works.
806  // ... always true due to unsigned int...
807  // assert( values_per_sample_ >= 0 );
808 
809  if( sample_index >= num_samples_ ) {
810  if( throwing ) {
811  throw std::invalid_argument(
812  "Cannot get value at sample " + std::to_string(sample_index) + " at index " +
813  std::to_string( value_index ) + ", as there are only " +
814  std::to_string( num_samples_ ) + " samples in the VCF/BCF record."
815  );
816  } else {
817  return false;
818  }
819  }
820  if( value_index >= values_per_sample_ ) {
821  if( throwing ) {
822  throw std::invalid_argument(
823  "Cannot get value at sample " + std::to_string(sample_index) + " at index " +
824  std::to_string( value_index ) + ", as there are only " +
825  std::to_string( values_per_sample_ ) + " values per sample in this VCF/BCF record."
826  );
827  } else {
828  return false;
829  }
830  }
831  return true;
832  }
833 
834  bool test_valid_value_( size_t sample_index, size_t value_index, bool throwing ) const
835  {
836  // Here, we only assert that the indices are valid, as this job has to be done
837  // by the caller independently beforehand.
838  assert( test_index_boundaries_( sample_index, value_index, false ));
839 
840  S* ptr = get_value_ptr_( sample_index, value_index );
841  assert( ptr );
842  if( is_vector_end_( *ptr )) {
843  if( throwing ) {
844  throw std::invalid_argument(
845  "Cannot get value at sample " + std::to_string(sample_index) + " at index " +
847  ", as this value is marked as the vector end for that sample."
848  );
849  } else {
850  return false;
851  }
852  }
853  if( is_missing_value_( *ptr )) {
854  if( throwing ) {
855  throw std::invalid_argument(
856  "Cannot get value at sample " + std::to_string(sample_index) + " at index " +
858  ", as this value is marked as missing for that sample."
859  );
860  } else {
861  return false;
862  }
863  }
864  return true;
865  }
866 
867  // -------------------------------------------------------------------------
868  // Specializations
869  // -------------------------------------------------------------------------
870 
871  // We need a couple of special functions that are different for each data type (char*, int, float)
872  // that we want to process. Their respective specializations are implemented at the end of this
873  // file - this also means that other instanciations will not work out of the box.
874 
889  int construct_values_(
890  ::bcf_hdr_t* header, ::bcf1_t* record, std::string const& id, VcfValueType ht_type
891  );
892 
896  bool is_vector_end_( S val ) const;
897 
901  bool is_missing_value_( S val ) const;
902 
903  // -------------------------------------------------------------------------
904  // Data Members
905  // -------------------------------------------------------------------------
906 
907 private:
908 
909  // Let's keep it easy to detect the end iterator. We initialize to true here,
910  // so that we can simply use a default constructor for the end iterator.
911  bool is_end_ = true;
912 
913  // We keep pointers to the header and the record here, but do not manage them.
914  ::bcf_hdr_t* header_ = nullptr;
915  ::bcf1_t* record_ = nullptr;
916 
917  // Data members needed for htslib functions: value_buffer_ and values_reserved_ are where
918  // the data from the record gets copied to. We use a shared_ptr to manage the lifetime,
919  // in order to not have to bother with this when copies of this iterator are made
920  // (which happens automatically in range-based for loops for example...).
921  // values_total_ is the total number of values stored in the buffer, which should be <= the
922  // reserved size.
923  // TODO find a way to re-use these buffers (maybe store them in VcfRecord instead?!, or
924  // by re-using the same iterator instance?! how about a static map from format ID to buffer,
925  // so that one can iterate multiple formats at the same time?! then, the pointer here would
926  // just need to point to the map value, but not manage it?! not sure how that would need to be)
927  // --> make that a std::unordered_map<std::string [id], struct{ buff, buff_size }> that is
928  // held by VcfRecord and handed over to the iterator on construction!
929  std::shared_ptr<S> value_buffer_ = nullptr;
930  int values_reserved_ = 0;
931  int values_total_ = 0;
932 
933  // Store the total number of samples in the record that we want to iterate over.
934  size_t num_samples_;
935 
936  // Futhermore, values_per_sample_ is the count of values per sample (you don't say?!),
937  // which is computed as the total number of values (returned by the htslib function that gives
938  // us the data in the first place) divided by the number of samples. This is really ugly,
939  // but it seems that's how htslib wants us to do this.
940  size_t values_per_sample_;
941 
942  // Position (in number of samples) of our iteration, and position (in number of values per
943  // sample) within the current sample. We start with -1 for the value index, to make the "go
944  // to next value" function easy to implement without special case for the first value.
945  // As htslib assumes int indices anyway, and as there won't be 2 billion values for a single
946  // sample in vcf, the loss of variable scope should be okay (as we are using int instand of uint).
947  // The value -1 is only ever used in between function calls, and no user should ever be able
948  // to observe this value, e.g., via the value_index() function (which we of course assert).
949  size_t sample_idx_ = 0;
950  int32_t value_idx_ = -1;
951 };
952 
953 // =================================================================================================
954 // Specializations as needed
955 // =================================================================================================
956 
957 // -------------------------------------------------------------------------
958 // construct_values_
959 // -------------------------------------------------------------------------
960 
961 template<>
962 inline int VcfFormatIterator<int32_t, int32_t>::construct_values_(
963  ::bcf_hdr_t* header, ::bcf1_t* record, std::string const& id, VcfValueType ht_type
964 ) {
965  // Get all values, and store the result of the function call, which gives the total
966  // number of values that are written to value_buffer_. We store that as values_total_,
967  // but also return it so that it can be used as an error checking mechanism
968  // (htslib misuses that and encodes errors by negative numbers).
969  // As we use a shared_ptr for lifetime management of the buffer, but htslib expects a plain pointer,
970  // we do this in two steps, transferring ownership to the shared_ptr after the htslib call.
971  // We need a custom deleter for the shared_ptr, as C++11 does not support delete[] for this
972  // out of the box (see https://stackoverflow.com/a/13062069/4184258).
973  assert( static_cast<int>( ht_type ) == static_cast<int>( VcfValueType::kInteger ));
974  int32_t* tmp_ptr = nullptr;
975  values_total_ = VcfFormatHelper::bcf_get_format_values_(
976  header, record, id.c_str(), reinterpret_cast<void**>( &tmp_ptr ),
977  &values_reserved_, static_cast<int>( ht_type )
978  );
979  value_buffer_ = std::shared_ptr<int32_t>( tmp_ptr, std::default_delete<int32_t[]>());
980  return values_total_;
981 }
982 
983 template<>
984 inline int VcfFormatIterator<float, double>::construct_values_(
985  ::bcf_hdr_t* header, ::bcf1_t* record, std::string const& id, VcfValueType ht_type
986 ) {
987  // See above <int32_t, int32_t> for an explanation. Here, it's the same, just for float.
988  assert( static_cast<int>( ht_type ) == static_cast<int>( VcfValueType::kFloat ));
989  float* tmp_ptr = nullptr;
990  values_total_ = VcfFormatHelper::bcf_get_format_values_(
991  header, record, id.c_str(), reinterpret_cast<void**>( &tmp_ptr ),
992  &values_reserved_, static_cast<int>( ht_type )
993  );
994  value_buffer_ = std::shared_ptr<float>( tmp_ptr, std::default_delete<float[]>());
995  return values_total_;
996 }
997 
998 template<>
999 inline int VcfFormatIterator<char*, std::string>::construct_values_(
1000  ::bcf_hdr_t* header, ::bcf1_t* record, std::string const& id, VcfValueType ht_type
1001 ) {
1002  // See above <int32_t, int32_t> for the basic explanation of what we do here.
1003  // For char*/string we however need to call a different bcf_get_format_*()
1004  // function, and need two deletion steps to free the char* as well as the char** memory,
1005  // so we use a custom deleter here that does both.
1006  (void) ht_type;
1007  assert( static_cast<int>( ht_type ) == static_cast<int>( VcfValueType::kString ));
1008  char** tmp_ptr = nullptr;
1009  int const res = VcfFormatHelper::bcf_get_format_string_(
1010  header, record, id.c_str(), &tmp_ptr, &values_reserved_
1011  );
1012  value_buffer_ = std::shared_ptr<char*>( tmp_ptr, [](char** ptr){
1013  free(ptr[0]);
1014  free(ptr);
1015  });
1016 
1017  // The above htslib call returns the number of chars for the longest string in the data
1018  // (plus some extra), but not the number of total values. That implies that we cannot have
1019  // mutliple string values for one sample (which seems not to be documented in htslib anywhere,
1020  // but seems to be just assumed). Also, that means that we have to manually set the value total
1021  // to a useful value, which for our purposes has to be the number of samples, so that
1022  // our division to get values_per_sample_ works out properly.
1023 
1024  // We use our wrapper VcfFormatHelper::bcf_hdr_nsamples_ here, which wraps bcf_hdr_nsamples()
1025  // so that we do not have to include htslib here.
1026  values_total_ = VcfFormatHelper::bcf_hdr_nsamples_(header);
1027 
1028  // Now let's return our htslib function call result, which is only used to test that it is
1029  // not negative (which would indicate an error).
1030  return res;
1031 }
1032 
1033 template<>
1034 inline int VcfFormatIterator<int32_t, VcfGenotype>::construct_values_(
1035  ::bcf_hdr_t* header, ::bcf1_t* record, std::string const& id, VcfValueType ht_type
1036 ) {
1037  // This class is supposed to be only instanciated from the VcfFormatHelper iterator begin/end functions.
1038  // Hence, we here simply assert that the provided values are correct.
1039  // if( id != "GT" ) {
1040  // throw std::invalid_argument(
1041  // "Can only create an instance of VcfFormatIterator that is using VcfGenotype " +
1042  // "for the FORMAT key/id \"GT\", but \"" + id + "\" was provided instead."
1043  // );
1044  // }
1045  // if( ht_type != VcfValueType.kInt ) {
1046  // throw std::invalid_argument(
1047  // "Can only create an instance of VcfFormatIterator for value type " +
1048  // vcf_value_type_to_string( VcfValueType.kInt ) ", but " +
1049  // vcf_value_type_to_string( ht_type ) + " was provided instead."
1050  // );
1051  // }
1052  (void) id;
1053  (void) ht_type;
1054  assert( id == "GT" );
1055  assert( static_cast<int>( ht_type ) == static_cast<int>( VcfValueType::kInteger ));
1056 
1057  // See above <int32_t, int32_t> for an explanation. Here, it's the same, but we later convert
1058  // to our VcfGenotype instance instead (implicitly, when returning a value).
1059  int32_t* tmp_ptr = nullptr;
1060  values_total_ = VcfFormatHelper::bcf_get_genotypes_(
1061  header, record, reinterpret_cast<void**>( &tmp_ptr ), &values_reserved_
1062  );
1063  value_buffer_ = std::shared_ptr<int32_t>( tmp_ptr, std::default_delete<int32_t[]>());
1064  return values_total_;
1065 }
1066 
1067 // -------------------------------------------------------------------------
1068 // is_vector_end_
1069 // -------------------------------------------------------------------------
1070 
1071 template<>
1072 inline bool VcfFormatIterator<int32_t, int32_t>::is_vector_end_( int32_t val ) const
1073 {
1074  return val == VcfFormatHelper::bcf_int32_vector_end_;
1075 }
1076 
1077 template<>
1078 inline bool VcfFormatIterator<float, double>::is_vector_end_( float val ) const
1079 {
1080  return VcfFormatHelper::bcf_float_is_vector_end_( val );
1081 }
1082 
1083 template<>
1084 inline bool VcfFormatIterator<char*, std::string>::is_vector_end_( char* val ) const
1085 {
1086  assert( val );
1087  return *val == VcfFormatHelper::bcf_str_vector_end_;
1088 }
1089 
1090 template<>
1091 inline bool VcfFormatIterator<int32_t, VcfGenotype>::is_vector_end_( int32_t val ) const
1092 {
1093  return val == VcfFormatHelper::bcf_int32_vector_end_;
1094 }
1095 
1096 // -------------------------------------------------------------------------
1097 // is_missing_value_
1098 // -------------------------------------------------------------------------
1099 
1100 template<>
1101 inline bool VcfFormatIterator<int32_t, int32_t>::is_missing_value_( int32_t val ) const
1102 {
1103  return val == VcfFormatHelper::bcf_int32_missing_;
1104 }
1105 
1106 template<>
1107 inline bool VcfFormatIterator<float, double>::is_missing_value_( float val ) const
1108 {
1109  return VcfFormatHelper::bcf_float_is_missing_( val );
1110 }
1111 
1112 template<>
1113 inline bool VcfFormatIterator<char*, std::string>::is_missing_value_( char* val ) const
1114 {
1115  assert( val );
1116  return *val == VcfFormatHelper::bcf_str_missing_;
1117 }
1118 
1119 template<>
1120 inline bool VcfFormatIterator<int32_t, VcfGenotype>::is_missing_value_( int32_t val ) const
1121 {
1122  return val == VcfFormatHelper::bcf_int32_missing_;
1123 }
1124 
1125 } // namespace population
1126 } // namespace genesis
1127 
1128 #endif // htslib guard
1129 #endif // include guard
genesis::population::VcfFormatIterator::get_value
T get_value() const
Get the value where the iterator currently resides.
Definition: vcf_format_iterator.hpp:557
genesis::population::VcfFormatIterator::value_index
size_t value_index() const
Return the index of the current value within the current sample.
Definition: vcf_format_iterator.hpp:506
genesis::population::VcfFormatIterator::sample_name
std::string sample_name() const
Return the name of the current sample, as given in the #CHROM ... header line of the VCF file.
Definition: vcf_format_iterator.hpp:516
genesis::population::VcfFormatIterator::operator*
self_type & operator*()
Dereference, which gives the iterator itself instead of the value, as our values should be accessed v...
Definition: vcf_format_iterator.hpp:420
genesis::population::VcfFormatIterator::sample_count
size_t sample_count() const
Return the total number of samples that we are iterating over.
Definition: vcf_format_iterator.hpp:395
genesis::population::VcfFormatIterator::VcfFormatIterator
VcfFormatIterator(::bcf_hdr_t *header, ::bcf1_t *record, std::string const &id, VcfValueType ht_type)
Create an instance, given the htslib header, record line, and the FORMAT id tag/key (as well as its d...
Definition: vcf_format_iterator.hpp:307
genesis::population::VcfValueType::kString
@ kString
genesis::population::VcfFormatIterator::get_value_at
T get_value_at(size_t sample_index, size_t value_index) const
Get the value at a given value_index of a given sample at sample_index.
Definition: vcf_format_iterator.hpp:655
genesis::population::VcfValueType::kFloat
@ kFloat
vcf_header.hpp
genesis::population::VcfFormatIterator::sample_name_at
std::string sample_name_at(size_t sample_index) const
Return the sample name at a given index within 0 and sample_count().
Definition: vcf_format_iterator.hpp:603
genesis::population::VcfFormatIterator::operator!=
bool operator!=(self_type const &other) const
Inequality comparison, needed to detect the end of the iteration.
Definition: vcf_format_iterator.hpp:446
genesis::population::VcfFormatIterator::~VcfFormatIterator
~VcfFormatIterator()=default
genesis::population::VcfFormatIterator::operator++
self_type & operator++()
Pre-increment operator to move to the next sample.
Definition: vcf_format_iterator.hpp:457
genesis::population::VcfFormatIterator::values_per_sample
size_t values_per_sample() const
Return the number of values that each sample has.
Definition: vcf_format_iterator.hpp:407
genesis::population::VcfFormatHelper
Provide htslib helper functions.
Definition: vcf_format_iterator.hpp:85
genesis::population::VcfFormatIterator::get_values
std::vector< T > get_values(bool include_missing=false) const
Get a vector of all values for the current sample.
Definition: vcf_format_iterator.hpp:682
genesis::population::VcfFormatIterator
Iterate the FORMAT information for the samples in a SNP/variant line in a VCF/BCF file.
Definition: vcf_format_iterator.hpp:62
vcf_common.hpp
genesis::population::VcfFormatIterator::has_value_at
bool has_value_at(size_t value_index) const
Return whether the value at a given index within the current sample is valid.
Definition: vcf_format_iterator.hpp:623
genesis::population::VcfValueType::kInteger
@ kInteger
genesis::population::VcfFormatIterator::operator=
VcfFormatIterator & operator=(VcfFormatIterator const &)=default
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VcfFormatIterator::sample_index
size_t sample_index() const
Return the index of the column of the current sample.
Definition: vcf_format_iterator.hpp:496
genesis::population::VcfHeaderLine::kFormat
@ kFormat
genesis::population::VcfFormatIterator::operator==
bool operator==(self_type const &other) const
Equality comparison, needed to detect the end of the iteration.
Definition: vcf_format_iterator.hpp:428
genesis::population::VcfFormatIterator::next_value
void next_value()
Move to the next value within the current sample.
Definition: vcf_format_iterator.hpp:579
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::population::VcfValueType
VcfValueType
Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:78
genesis::population::VcfFormatIterator::valid_value_count
size_t valid_value_count() const
Return the number of valid values for the current sample.
Definition: vcf_format_iterator.hpp:591
genesis::population::VcfFormatIterator::has_value
bool has_value() const
Return whether the iterator currently resides at a valid value of the current sample.
Definition: vcf_format_iterator.hpp:539
genesis::population::VcfFormatIterator::get_value_at
T get_value_at(size_t value_index) const
Get the value at a given value_index of the current sample.
Definition: vcf_format_iterator.hpp:647
genesis::population::VcfFormatIterator::header_data
::bcf_hdr_t * header_data()
Get the raw htslib structure pointer for the header.
Definition: vcf_format_iterator.hpp:379
genesis::population::VcfFormatIterator::valid_value_count_at
size_t valid_value_count_at(size_t sample_index) const
Return the number of valid values for a given sample_index.
Definition: vcf_format_iterator.hpp:726
genesis::population::VcfFormatIterator::VcfFormatIterator
VcfFormatIterator()=default
Create a default (empty) instance, that is used to indicate the end iterator position.
genesis::population::VcfFormatIterator::has_value_at
bool has_value_at(size_t sample_index, size_t value_index) const
Return whether the value at a given index within the given sample is valid.
Definition: vcf_format_iterator.hpp:634
genesis::population::VcfFormatIterator::get_values_at
std::vector< T > get_values_at(size_t sample_index, bool include_missing=false) const
Get a vector of all values for a given sample.
Definition: vcf_format_iterator.hpp:692
genesis::population::VcfFormatIterator::record_data
::bcf1_t * record_data()
Get the raw htslib structure pointer for the record/line.
Definition: vcf_format_iterator.hpp:387