44 #include <htslib/cram.h>
45 #include <htslib/hts.h>
46 #include <htslib/khash.h>
47 #include <htslib/kstring.h>
48 #include <htslib/sam.h>
52 namespace population {
58 struct SamVariantInputStream::SamFileHandle
74 ::htsFile* hts_file_ =
nullptr;
77 ::sam_hdr_t* sam_hdr_ =
nullptr;
88 std::unordered_map<std::string, size_t> rg_tags_;
93 std::vector<std::string> target_sample_names_;
101 size_t target_sample_count_;
123 std::vector<std::string> get_header_rg_tags_()
const;
146 static int read_sam_(
void* data, bam1_t* b );
155 static int pileup_cd_create_(
void* data, bam1_t
const* b, bam_pileup_cd* cd );
162 static int pileup_cd_destroy_(
void* data, bam1_t
const* b, bam_pileup_cd* cd );
174 void SamVariantInputStream::SamFileHandle::init_( SamVariantInputStream
const* parent )
185 assert( ! parent_->input_file_.empty() );
191 hts_file_ = hts_open( parent_->input_file_.c_str(),
"r" );
193 throw std::runtime_error(
"Cannot open file " + parent_->input_file_ );
195 sam_hdr_ = sam_hdr_read( hts_file_ );
197 throw std::runtime_error(
"Cannot read header of file " + parent_->input_file_ );
203 iter_ = bam_plp_init(
205 static_cast<void*
>(
this )
208 throw std::runtime_error(
"Cannot initialize to traverse file " + parent_->input_file_ );
210 if( parent_->max_acc_depth_ > 0 ) {
211 bam_plp_set_maxcnt( iter_, parent_->max_acc_depth_ );
232 target_sample_names_.clear();
233 target_sample_count_ = 0;
238 if( ! parent_->split_by_rg_ ) {
242 ! parent_->rg_tag_filter_.empty() ||
243 parent_->inverse_rg_tag_filter_ ||
244 parent_->with_unaccounted_rg_
246 throw std::runtime_error(
247 "Input settings for filtering samples based on their RG tag are set in the "
248 "SAM/BAM/CRAM reader, but the RG tag splitting is not activated in the reader."
253 target_sample_count_ = 1;
265 auto tags = get_header_rg_tags_();
266 auto filter_tags_copy = parent->rg_tag_filter_;
267 for(
auto& tag : tags ) {
274 filter_tags_copy.erase( tag );
284 ( parent_->rg_tag_filter_.empty() ) ||
285 ( ! parent_->inverse_rg_tag_filter_ && parent_->rg_tag_filter_.count( tag ) > 0 ) ||
286 ( parent_->inverse_rg_tag_filter_ && parent_->rg_tag_filter_.count( tag ) == 0 )
292 rg_tags_.emplace( tag, target_sample_count_ );
293 target_sample_names_.emplace_back( std::move( tag ));
294 ++target_sample_count_;
299 rg_tags_.emplace( std::move( tag ), std::numeric_limits<size_t>::max() );
305 if( filter_tags_copy.size() > 0 ) {
309 tags = get_header_rg_tags_();
310 std::string hd_tags_msg;
312 hd_tags_msg =
" Header does not contain any RG tags; there can hence be no filtering.";
314 hd_tags_msg =
" First @RG tag that appears in the header: \"" + (*tags.begin()) +
"\".";
318 assert( ! filter_tags_copy.empty() );
319 throw std::runtime_error(
320 "Invalid list of @RG read group tags given for filtering the SAM/BAM/CRAM file, "
321 "which do not occur in the @RG list in the header of the file." + hd_tags_msg +
322 " First offending RG tag that appears in the given filter list, "
323 "but not in the header: \"" + (*filter_tags_copy.begin()) +
"\"."
329 assert( rg_tags_.size() == tags.size() );
330 assert( parent_->split_by_rg_ );
333 if( parent_->with_unaccounted_rg_ ) {
334 target_sample_names_.emplace_back(
"unaccounted" );
335 ++target_sample_count_;
337 assert( target_sample_names_.size() == target_sample_count_ );
350 bam_plp_constructor( iter_, pileup_cd_create_ );
358 std::vector<std::string> SamVariantInputStream::SamFileHandle::get_header_rg_tags_()
const
365 std::vector<std::string> result;
368 auto const n_rg = sam_hdr_count_lines( sam_hdr_,
"RG" );
370 throw std::runtime_error(
371 "Failed to get @RG ID tags in file " + parent_->input_file_ +
372 ". Cannot split by RG read group tags."
377 kstring_t id_val = KS_INITIALIZE;
378 for(
size_t i = 0; i < static_cast<size_t>( n_rg ); ++i ) {
381 if( sam_hdr_find_tag_pos( sam_hdr_,
"RG", i,
"ID", &id_val ) < 0 ) {
383 throw std::runtime_error(
384 "Failed to get @RG ID tags in file " + parent_->input_file_
390 char* tag = ks_release( &id_val );
391 result.emplace_back( tag );
402 SamVariantInputStream::SamFileHandle::~SamFileHandle()
414 bam_plp_destroy( iter_ );
418 sam_hdr_destroy( sam_hdr_ );
422 hts_close( hts_file_ );
436 int SamVariantInputStream::SamFileHandle::read_sam_(
437 void* data, bam1_t* bam
445 auto handle =
static_cast<SamVariantInputStream::SamFileHandle*
>( data );
447 assert( handle->parent_ );
448 assert( handle->hts_file_ );
449 assert( handle->sam_hdr_ );
464 ret = sam_read1( handle->hts_file_, handle->sam_hdr_, bam );
469 throw std::runtime_error(
"Error reading file " + handle->parent_->input_file() );
473 auto const& flags_in_all = handle->parent_->flags_include_all_;
474 auto const& flags_in_any = handle->parent_->flags_include_any_;
475 auto const& flags_ex_all = handle->parent_->flags_exclude_all_;
476 auto const& flags_ex_any = handle->parent_->flags_exclude_any_;
479 if( flags_in_all && (( bam->core.flag & flags_in_all ) != flags_in_all )) {
489 if( flags_in_any && (( bam->core.flag & flags_in_any ) == 0 )) {
500 if( flags_ex_all && (( bam->core.flag & flags_ex_all ) == flags_ex_all )) {
513 if( flags_ex_any && (( bam->core.flag & flags_ex_any ) != 0 )) {
525 if(
static_cast<int>( bam->core.qual ) < handle->parent_->min_map_qual_ ) {
538 int SamVariantInputStream::SamFileHandle::pileup_cd_create_(
539 void* data, bam1_t
const* b, bam_pileup_cd* cd
542 auto handle =
static_cast<SamVariantInputStream::SamFileHandle*
>( data );
546 assert( handle->parent_ );
547 assert( handle->parent_->split_by_rg_ );
561 auto tag_itr = handle->rg_tags_.end();
566 uint8_t* tag = bam_aux_get( b,
"RG" );
567 if( tag !=
nullptr ) {
574 char* rg = bam_aux2Z( tag );
575 tag_itr = handle->rg_tags_.find( std::string( rg ));
590 if( tag_itr != handle->rg_tags_.end() ) {
599 smp_idx = tag_itr->second;
600 if( handle->parent_->with_unaccounted_rg_ ) {
602 smp_idx < handle->target_sample_count_ - 1 ||
603 smp_idx == std::numeric_limits<size_t>::max()
607 smp_idx < handle->target_sample_count_ ||
608 smp_idx == std::numeric_limits<size_t>::max()
614 if( handle->parent_->with_unaccounted_rg_ ) {
618 assert( handle->target_sample_count_ > 0 );
619 smp_idx = handle->target_sample_count_ - 1;
623 smp_idx = std::numeric_limits<size_t>::max();
631 smp_idx < handle->target_sample_count_ ||
632 smp_idx == std::numeric_limits<size_t>::max()
634 if( smp_idx == std::numeric_limits<size_t>::max() ) {
648 int SamVariantInputStream::SamFileHandle::pileup_cd_destroy_(
649 void* data, bam1_t
const* b, bam_pileup_cd* cd
667 SamVariantInputStream::Iterator::Iterator( SamVariantInputStream
const* parent )
669 , handle_( std::make_shared<SamFileHandle>() )
677 assert( strcmp( seq_nt16_str,
"=ACMGRSVTWYHKDBN" ) == 0 );
682 if( parent_->input_file_.empty() ) {
688 handle_->init_( parent_ );
698 std::vector<std::string> SamVariantInputStream::Iterator::rg_tags(
bool all_header_tags )
const
712 if( all_header_tags ) {
721 return handle_->get_header_rg_tags_();
725 return handle_->target_sample_names_;
732 size_t SamVariantInputStream::Iterator::sample_size()
const
735 return handle_->target_sample_count_;
746 void SamVariantInputStream::Iterator::increment_()
751 assert( handle_->parent_ );
752 assert( handle_->sam_hdr_ );
753 assert( handle_->iter_ );
759 bam_pileup1_t
const* plp;
761 plp = bam_plp_auto( handle_->iter_, &tid, &pos, &n );
771 if( plp ==
nullptr ) {
775 if( tid < 0 || n < 0 ) {
781 parent_->region_filter_ &&
782 ! parent_->region_filter_->is_covered(
783 std::string( handle_->sam_hdr_->target_name[tid] ),
784 static_cast<size_t>( pos ) + 1
791 if( parent_->min_depth_ && n < parent_->min_depth_ ) {
794 if( parent_->max_depth_ && n > parent_->max_depth_ ) {
807 (
static_cast<size_t>( pos ) + 1 > current_variant_.position ) ||
808 ( std::string( handle_->sam_hdr_->target_name[tid] ) != current_variant_.chromosome )
812 current_variant_.chromosome = std::string( handle_->sam_hdr_->target_name[tid] );
813 current_variant_.position =
static_cast<size_t>( pos ) + 1;
814 current_variant_.reference_base =
'N';
815 current_variant_.alternative_base =
'N';
816 current_variant_.status.reset();
824 current_variant_.samples.resize( handle_->target_sample_count_ );
825 for(
auto& sample : current_variant_.samples ) {
826 sample = SampleCounts();
833 for(
int i = 0; i < n; ++i ) {
834 bam_pileup1_t
const* p = plp + i;
843 void SamVariantInputStream::Iterator::process_base_( bam_pileup1_t
const* p )
850 int const qual = p->qpos < p->b->core.l_qseq
851 ? bam_get_qual(p->b)[p->qpos]
854 if( qual < parent_->min_base_qual_ ) {
860 auto const smp_idx = get_sample_index_( p );
861 if( smp_idx == std::numeric_limits<size_t>::max() ) {
866 assert( current_variant_.samples.size() == handle_->target_sample_count_ );
867 assert( smp_idx < current_variant_.samples.size() );
868 assert( handle_->target_sample_count_ == current_variant_.samples.size() );
869 auto& sample = current_variant_.samples[smp_idx];
872 if( p->is_del || p->is_refskip ){
879 uint8_t* seq = bam_get_seq (p->b );
880 uint8_t nuc = bam_seqi( seq, p->qpos );
903 throw std::runtime_error(
904 "Invalid base in sam/bam/cram file " + parent_->input_file_ +
905 " at " + current_variant_.chromosome +
":" +
907 std::string( 1, seq_nt16_str[nuc] ) +
", but expected [ACGTN]."
917 size_t SamVariantInputStream::Iterator::get_sample_index_( bam_pileup1_t
const* p )
const
924 if( ! parent_->split_by_rg_ ) {
929 assert( parent_->split_by_rg_ );
933 auto const index = p->cd.i;
935 assert( index == -1 );
936 return std::numeric_limits<size_t>::max();
938 assert(
static_cast<size_t>( index ) < handle_->target_sample_count_ );
939 return static_cast<size_t>( index );
947 SamVariantInputStream::SamVariantInputStream(
948 std::string
const& infile,
949 std::unordered_set<std::string>
const& rg_tag_filter,
950 bool inverse_rg_tag_filter
953 input_file( infile );
960 rg_tag_filter_ = rg_tag_filter;
961 inverse_rg_tag_filter_ = inverse_rg_tag_filter;
987 #endif // htslib guard