1 #ifndef GENESIS_POPULATION_FORMAT_VCF_FORMAT_ITERATOR_H_
2 #define GENESIS_POPULATION_FORMAT_VCF_FORMAT_ITERATOR_H_
55 namespace population {
61 template<
typename S,
typename T >
92 template<
typename S,
typename T >
97 static const int8_t bcf_int8_vector_end_ = (-127);
98 static const int16_t bcf_int16_vector_end_ = (-32767);
99 static const int32_t bcf_int32_vector_end_ = (-2147483647);
100 static const int64_t bcf_int64_vector_end_ = (-9223372036854775807LL);
101 static const char bcf_str_vector_end_ = 0;
102 static const int8_t bcf_int8_missing_ = (-128);
103 static const int16_t bcf_int16_missing_ = (-32767-1);
104 static const int32_t bcf_int32_missing_ = (-2147483647-1);
105 static const int64_t bcf_int64_missing_ = (-9223372036854775807LL - 1LL);
106 static const char bcf_str_missing_ = 0x07;
113 static int32_t bcf_hdr_nsamples_( ::bcf_hdr_t
const* header );
120 static std::string hdr_sample_name_( ::bcf_hdr_t
const* header,
size_t index );
126 static int bcf_get_format_string_(
127 const bcf_hdr_t *hdr, bcf1_t *line,
const char *tag,
char ***dst,
int *ndst
134 static int bcf_get_format_values_(
135 const bcf_hdr_t *hdr, bcf1_t *line,
const char *tag,
void **dst,
int *ndst,
int type
142 static int bcf_get_genotypes_(
const bcf_hdr_t *hdr, bcf1_t *line,
void **dst,
int *ndst );
148 static int bcf_float_is_vector_end_(
float f);
154 static int bcf_float_is_missing_(
float f);
276 template<
typename S,
typename T >
321 auto const res = construct_values_( header_, record_,
id, ht_type );
328 VcfHeader::check_value_return_code_(
334 assert( value_buffer_ );
336 assert( values_total_ >= 0 );
338 assert( values_total_ <= values_reserved_ );
353 num_samples_ = VcfFormatHelper::bcf_hdr_nsamples_(header);
354 assert(
static_cast<size_t>( values_total_ ) % num_samples_ == 0 );
355 values_per_sample_ =
static_cast<size_t>( values_total_ ) / num_samples_;
356 assert( values_per_sample_ * num_samples_ ==
static_cast<size_t>( values_total_ ));
359 assert( value_idx_ == -1 );
361 assert( value_idx_ >= 0 );
409 return values_per_sample_;
434 return ( is_end_ && other.is_end_ ) || (
435 !is_end_ && !other.is_end_ &&
436 header_ == other.header_ && record_ == other.record_ &&
437 values_total_ == other.values_total_ && num_samples_ == other.num_samples_ &&
438 values_per_sample_ == other.values_per_sample_ &&
439 sample_idx_ == other.sample_idx_ && value_idx_ == other.value_idx_
448 return !( *
this == other );
460 if( sample_idx_ < num_samples_ ) {
465 assert( value_idx_ >= 0 );
508 assert( value_idx_ >= 0 );
520 assert(
static_cast<int32_t
>( num_samples_ ) == VcfFormatHelper::bcf_hdr_nsamples_( header_ ));
521 assert( sample_idx_ < num_samples_ );
525 return VcfFormatHelper::hdr_sample_name_( header_, sample_idx_ );
544 assert( value_idx_ >= 0 );
546 test_index_boundaries_( sample_idx_, value_idx_,
false ) &&
547 test_valid_value_( sample_idx_, value_idx_,
false )
563 assert( value_idx_ >= 0 );
564 assert( test_index_boundaries_( sample_idx_, value_idx_,
false ));
565 assert( test_valid_value_( sample_idx_, value_idx_,
false ));
570 return T{ *get_value_ptr_( sample_idx_, value_idx_ ) };
608 assert(
static_cast<int32_t
>( num_samples_ ) == VcfFormatHelper::bcf_hdr_nsamples_( header_ ));
613 return VcfFormatHelper::hdr_sample_name_( header_,
sample_index );
641 return (! is_vector_end_( *ptr )) && (! is_missing_value_( *ptr ));
682 std::vector<T>
get_values(
bool include_missing =
false )
const
697 assert(
static_cast<int32_t
>( num_samples_ ) == VcfFormatHelper::bcf_hdr_nsamples_( header_ ));
700 std::vector<T> result;
701 if( include_missing ) {
702 result.reserve( values_per_sample_ );
703 for(
size_t i = 0; i < values_per_sample_; ++i ) {
705 result.emplace_back( *ptr );
709 result.reserve( cnt );
710 for(
size_t i = 0; i < values_per_sample_; ++i ) {
713 result.emplace_back( *ptr );
729 for(
size_t i = 0; i < values_per_sample_; ++i ) {
746 assert( value_buffer_ );
747 assert( values_total_ >= 0 );
758 void go_to_next_value_()
762 assert( value_idx_ >= 0 );
765 while(
static_cast<size_t>( value_idx_ ) < values_per_sample_ ) {
766 S* current = get_value_ptr_( sample_idx_, value_idx_ );
770 if( is_vector_end_( *current )) {
771 value_idx_ = values_per_sample_;
776 if( is_missing_value_( *current )) {
784 assert( value_idx_ >= 0 );
793 (
static_cast<size_t>( value_idx_ ) >= values_per_sample_ ) ||
795 test_index_boundaries_( sample_idx_, value_idx_,
false ) &&
796 test_valid_value_( sample_idx_, value_idx_,
false )
811 throw std::invalid_argument(
814 std::to_string( num_samples_ ) +
" samples in the VCF/BCF record."
822 throw std::invalid_argument(
825 std::to_string( values_per_sample_ ) +
" values per sample in this VCF/BCF record."
842 if( is_vector_end_( *ptr )) {
844 throw std::invalid_argument(
847 ", as this value is marked as the vector end for that sample."
853 if( is_missing_value_( *ptr )) {
855 throw std::invalid_argument(
858 ", as this value is marked as missing for that sample."
889 int construct_values_(
890 ::bcf_hdr_t* header, ::bcf1_t* record, std::string
const&
id,
VcfValueType ht_type
896 bool is_vector_end_( S val )
const;
901 bool is_missing_value_( S val )
const;
914 ::bcf_hdr_t* header_ =
nullptr;
915 ::bcf1_t* record_ =
nullptr;
929 std::shared_ptr<S> value_buffer_ =
nullptr;
930 int values_reserved_ = 0;
931 int values_total_ = 0;
940 size_t values_per_sample_;
949 size_t sample_idx_ = 0;
950 int32_t value_idx_ = -1;
962 inline int VcfFormatIterator<int32_t, int32_t>::construct_values_(
963 ::bcf_hdr_t* header, ::bcf1_t* record, std::string
const&
id,
VcfValueType ht_type
974 int32_t* tmp_ptr =
nullptr;
975 values_total_ = VcfFormatHelper::bcf_get_format_values_(
976 header, record,
id.c_str(),
reinterpret_cast<void**
>( &tmp_ptr ),
977 &values_reserved_,
static_cast<int>( ht_type )
981 value_buffer_ = std::shared_ptr<int32_t>( tmp_ptr, [](int32_t* p) { ::free(p); });
982 return values_total_;
986 inline int VcfFormatIterator<float, double>::construct_values_(
987 ::bcf_hdr_t* header, ::bcf1_t* record, std::string
const&
id,
VcfValueType ht_type
991 float* tmp_ptr =
nullptr;
992 values_total_ = VcfFormatHelper::bcf_get_format_values_(
993 header, record,
id.c_str(),
reinterpret_cast<void**
>( &tmp_ptr ),
994 &values_reserved_,
static_cast<int>( ht_type )
996 value_buffer_ = std::shared_ptr<float>( tmp_ptr, [](
float* p) { ::free(p); });
997 return values_total_;
1001 inline int VcfFormatIterator<char*, std::string>::construct_values_(
1002 ::bcf_hdr_t* header, ::bcf1_t* record, std::string
const&
id,
VcfValueType ht_type
1010 char** tmp_ptr =
nullptr;
1011 int const res = VcfFormatHelper::bcf_get_format_string_(
1012 header, record,
id.c_str(), &tmp_ptr, &values_reserved_
1014 value_buffer_ = std::shared_ptr<char*>( tmp_ptr, [](
char** ptr){
1028 values_total_ = VcfFormatHelper::bcf_hdr_nsamples_(header);
1036 inline int VcfFormatIterator<int32_t, VcfGenotype>::construct_values_(
1037 ::bcf_hdr_t* header, ::bcf1_t* record, std::string
const&
id,
VcfValueType ht_type
1056 assert(
id ==
"GT" );
1061 int32_t* tmp_ptr =
nullptr;
1062 values_total_ = VcfFormatHelper::bcf_get_genotypes_(
1063 header, record,
reinterpret_cast<void**
>( &tmp_ptr ), &values_reserved_
1065 value_buffer_ = std::shared_ptr<int32_t>( tmp_ptr, [](int32_t* p) { ::free(p); });
1066 return values_total_;
1074 inline bool VcfFormatIterator<int32_t, int32_t>::is_vector_end_( int32_t val )
const
1076 return val == VcfFormatHelper::bcf_int32_vector_end_;
1080 inline bool VcfFormatIterator<float, double>::is_vector_end_(
float val )
const
1082 return VcfFormatHelper::bcf_float_is_vector_end_( val );
1086 inline bool VcfFormatIterator<char*, std::string>::is_vector_end_(
char* val )
const
1089 return *val == VcfFormatHelper::bcf_str_vector_end_;
1093 inline bool VcfFormatIterator<int32_t, VcfGenotype>::is_vector_end_( int32_t val )
const
1095 return val == VcfFormatHelper::bcf_int32_vector_end_;
1103 inline bool VcfFormatIterator<int32_t, int32_t>::is_missing_value_( int32_t val )
const
1105 return val == VcfFormatHelper::bcf_int32_missing_;
1109 inline bool VcfFormatIterator<float, double>::is_missing_value_(
float val )
const
1111 return VcfFormatHelper::bcf_float_is_missing_( val );
1115 inline bool VcfFormatIterator<char*, std::string>::is_missing_value_(
char* val )
const
1118 return *val == VcfFormatHelper::bcf_str_missing_;
1122 inline bool VcfFormatIterator<int32_t, VcfGenotype>::is_missing_value_( int32_t val )
const
1124 return val == VcfFormatHelper::bcf_int32_missing_;
1130 #endif // htslib guard
1131 #endif // include guard