A library for working with phylogenetic and population genetic data.
v0.32.0
sam_variant_input_stream.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
31 #ifdef GENESIS_HTSLIB
32 
34 
36 
37 #include <cassert>
38 #include <cstdint>
39 #include <cstring>
40 #include <limits>
41 #include <stdexcept>
42 
43 extern "C" {
44  #include <htslib/cram.h>
45  #include <htslib/hts.h>
46  #include <htslib/khash.h>
47  #include <htslib/kstring.h>
48  #include <htslib/sam.h>
49 }
50 
51 namespace genesis {
52 namespace population {
53 
54 // =================================================================================================
55 // Sam File Handle
56 // =================================================================================================
57 
58 struct SamVariantInputStream::SamFileHandle
59 {
60 public:
61 
62  // This whole class is private to its parent class (and the iterator).
63  // To keep variable naming in line, we hence use underscores for all variables here,
64  // despite them being public in the context of this class itself.
65 
66  // -------------------------------------------------------------------------
67  // Data Members
68  // -------------------------------------------------------------------------
69 
70  // Our main class, for access to settings.
71  SamVariantInputStream const* parent_;
72 
73  // File handle.
74  ::htsFile* hts_file_ = nullptr;
75 
76  // File header.
77  ::sam_hdr_t* sam_hdr_ = nullptr;
78 
79  // Current iterator. The `bam_plp_t` type is in fact a pointer. Messy htslib.
80  bam_plp_t iter_ = nullptr;
81  // ::hts_itr_t* hts_iter = nullptr;
82 
83  // List of the @RG read group tags as present in the header, filtered by any potential
84  // rg_tag_filter() and inverse_rg_tag_filter() settings. That is, this maps from
85  // RG tags to their index of the SampleCounts object in the Variant that is produced
86  // by this iterator at each position. We use a map in this direction for speed,
87  // as we get the RG tag name from htslib for each read.
88  std::unordered_map<std::string, size_t> rg_tags_;
89 
90  // We additionally keep a plain list around, just for the iterator to access.
91  // This list might include the "unaccounted" sample as well as a string, which is not part
92  // of the above map. Hence, this list here is purely meant for user output.
93  std::vector<std::string> target_sample_names_;
94 
95  // We furthermore store the number of SampleCounts samples needed in each Variant,
96  // as a result of the rg_tag splitting. With no splitting, this is 1, as we then only
97  // ever produce one sample, containing the base counts of all reads, independently of
98  // their RG tag.
99  // With splitting, this depends on rg_tag_filter() and with_unaccounted_rg() settings.
100  // See init_() for details.
101  size_t target_sample_count_;
102 
103  // -------------------------------------------------------------------------
104  // Internal Members
105  // -------------------------------------------------------------------------
106 
118  void init_( SamVariantInputStream const* parent );
119 
123  std::vector<std::string> get_header_rg_tags_() const;
124 
133  ~SamFileHandle();
134 
135  // -------------------------------------------------------------------------
136  // Static Callbacks
137  // -------------------------------------------------------------------------
138 
146  static int read_sam_( void* data, bam1_t* b );
147 
155  static int pileup_cd_create_( void* data, bam1_t const* b, bam_pileup_cd* cd );
156 
162  static int pileup_cd_destroy_( void* data, bam1_t const* b, bam_pileup_cd* cd );
163 
164 };
165 
166 // =================================================================================================
167 // Sam File Handle Members
168 // =================================================================================================
169 
170 // -------------------------------------------------------------------------
171 // init_
172 // -------------------------------------------------------------------------
173 
174 void SamVariantInputStream::SamFileHandle::init_( SamVariantInputStream const* parent )
175 {
176  // ----------------------------------
177  // General Setup
178  // ----------------------------------
179 
180  // Set the pointer to parent.
181  // We only create a handle if we have an actual file to process; that is, this function should
182  // not be called from Iterator when its a past-the-end default constructed Iterator that
183  // does not have any file.
184  parent_ = parent;
185  assert( ! parent_->input_file_.empty() );
186 
187  // Open the file and read its header.
188  // We use hts_open() here instead of sam_open(), as the latter is a macro whose expansion
189  // differs between htslib and samtools - WTF???
190  // In htslib, it simply forwards to hts_open() anyway, so let's just do that directly.
191  hts_file_ = hts_open( parent_->input_file_.c_str(), "r" );
192  if( ! hts_file_ ) {
193  throw std::runtime_error( "Cannot open file " + parent_->input_file_ );
194  }
195  sam_hdr_ = sam_hdr_read( hts_file_ );
196  if( ! sam_hdr_ ) {
197  throw std::runtime_error( "Cannot read header of file " + parent_->input_file_ );
198  }
199 
200  // Init the iterator, and set the max depth, to keep memory usage limited.
201  // Not sure that we also need to do the additional max depth check during plp porcessing below,
202  // but also doesn't hurt to keep it in there.
203  iter_ = bam_plp_init(
204  read_sam_,
205  static_cast<void*>( this )
206  );
207  if( ! iter_ ) {
208  throw std::runtime_error( "Cannot initialize to traverse file " + parent_->input_file_ );
209  }
210  if( parent_->max_acc_depth_ > 0 ) {
211  bam_plp_set_maxcnt( iter_, parent_->max_acc_depth_ );
212  }
213  // assert( hts_iter == nullptr );
214  // hts_iter = sam_itr_queryi(idx[i], HTS_IDX_START, 0, 0);
215 
216  // Once here, all is set up.
217  assert( parent_ );
218  assert( hts_file_ );
219  assert( sam_hdr_ );
220  assert( iter_ );
221 
222  // ----------------------------------
223  // RG Tag Setup
224  // ----------------------------------
225 
226  // If wanted, use the RG tags of the header to set the samples that we want to use,
227  // including any filtering if wanted, and taking care of the unaccounted setting as well.
228  // If not, we just init to one sample that catches all reads independently of their RG tag.
229 
230  // Prepare result. This is its default anyway, but why not.
231  rg_tags_.clear();
232  target_sample_names_.clear();
233  target_sample_count_ = 0;
234 
235  // If we do not want to split by RG tag, we simply leave rg_tag empty,
236  // and set the target count to 1, as in that case, we only ever produce one sample,
237  // containing the base counts of all reads, independently of their RG tag.
238  if( ! parent_->split_by_rg_ ) {
239 
240  // Some error checks. If we do not split, some settings shall not be set.
241  if(
242  ! parent_->rg_tag_filter_.empty() ||
243  parent_->inverse_rg_tag_filter_ ||
244  parent_->with_unaccounted_rg_
245  ) {
246  throw std::runtime_error(
247  "Input settings for filtering samples based on their RG tag are set in the "
248  "SAM/BAM/CRAM reader, but the RG tag splitting is not activated in the reader."
249  );
250  }
251 
252  // Set the target count to 1. We keep the target_sample_names_ empty in this case.
253  target_sample_count_ = 1;
254  return;
255  }
256 
257  // From here on, we know that we want to use the @RG read group tags from the header,
258  // and set the rg_tags_ to the samples (rg tags) that are there, including any potential
259  // filtering.
260 
261  // Get all RG read group tags from the header, and add the tags we want to the list.
262  // We check that the filter only uses tags that are actually in the header,
263  // by making a copy of the filter tags, and removing the ones that are in the header.
264  // Anything that remains is an error.
265  auto tags = get_header_rg_tags_();
266  auto filter_tags_copy = parent->rg_tag_filter_;
267  for( auto& tag : tags ) {
268 
269  // Remove the tag from the filter tag list copy if present. This also works if the tag
270  // is not actually present in the set (erase just doesn't do anything in that case),
271  // which of course it might be, as not all tags will be in the filter (otherwise,
272  // we'd not filter at all, or filter everything).
273  // We do this here before the tag has been moved from below.
274  filter_tags_copy.erase( tag );
275 
276  // Check if we want to include this rg tag. If not, we indicate that by setting
277  // the index to max, so that the function where samples are assessed can ignore them.
278  // This also makes sure that the rg tag filter setting is independent of the unaccounted
279  // filter, as we still will have listed all proper sample names from the header in the
280  // rg_tags_ list, so that when iterating, we can distinguish between tags that appear
281  // in the header but are filtered out, and those that are actually unaccounted for or
282  // are lacking the RG tag altogether.
283  if(
284  ( parent_->rg_tag_filter_.empty() ) ||
285  ( ! parent_->inverse_rg_tag_filter_ && parent_->rg_tag_filter_.count( tag ) > 0 ) ||
286  ( parent_->inverse_rg_tag_filter_ && parent_->rg_tag_filter_.count( tag ) == 0 )
287  ) {
288  // We set the index to the current target count, which only counts the valid samples
289  // that we do not want to filter out, meaning that each entry gets the index according
290  // to the how many-th valid element in the result map it is. Also add it to the list
291  // of names. We can now move it, as the tag is not needed any more.
292  rg_tags_.emplace( tag, target_sample_count_ );
293  target_sample_names_.emplace_back( std::move( tag ));
294  ++target_sample_count_;
295 
296  } else {
297  // RG tags that we want to filter out get assigned max int,
298  // as an indicator that the reading step shall skip them.
299  rg_tags_.emplace( std::move( tag ), std::numeric_limits<size_t>::max() );
300  }
301  }
302 
303  // Make sure that all given filters are actually valid according to the header.
304  // If any name remains in the list of copies, we have an error.
305  if( filter_tags_copy.size() > 0 ) {
306  // Nice output of some tags in the header. We need to fetch the tags from the header again,
307  // as we moved from the `tags` list in the loop above. But this is the error case,
308  // so the cost for this is fine.
309  tags = get_header_rg_tags_();
310  std::string hd_tags_msg;
311  if( tags.empty() ) {
312  hd_tags_msg = " Header does not contain any RG tags; there can hence be no filtering.";
313  } else {
314  hd_tags_msg = " First @RG tag that appears in the header: \"" + (*tags.begin()) + "\".";
315  }
316 
317  // Error. The filter tags are not empty, as we checked in the condition above.
318  assert( ! filter_tags_copy.empty() );
319  throw std::runtime_error(
320  "Invalid list of @RG read group tags given for filtering the SAM/BAM/CRAM file, "
321  "which do not occur in the @RG list in the header of the file." + hd_tags_msg +
322  " First offending RG tag that appears in the given filter list, "
323  "but not in the header: \"" + (*filter_tags_copy.begin()) + "\"."
324  );
325  }
326 
327  // After the above, we have added all tags to our map. We moved the strings, but the `tags`
328  // vector size is still the same, so let's assert this.
329  assert( rg_tags_.size() == tags.size() );
330  assert( parent_->split_by_rg_ );
331 
332  // If we use the unaccounted group, we need to make room for that additional sample.
333  if( parent_->with_unaccounted_rg_ ) {
334  target_sample_names_.emplace_back( "unaccounted" );
335  ++target_sample_count_;
336  }
337  assert( target_sample_names_.size() == target_sample_count_ );
338 
339  // Finally, set the callback functions that take care of finding the RG group per read.
340  // By only determining the RG tag once per read and storing it in the cliend data of the
341  // pileup, we have a tremendous speedup compared to our previous implementation where
342  // we determiend the RG tag for every base of every read again and again.
343  // See https://github.com/samtools/htslib/issues/1417#issuecomment-1088398131 for details.
344  // Functionality adapted from
345  // https://github.com/samtools/samtools/blob/4be69864304f4e7ac0113fdff9e4ff764b9d9267/bam_plcmd.c#L554-L557
346  // https://github.com/samtools/samtools/blob/4be69864304f4e7ac0113fdff9e4ff764b9d9267/bam_plcmd.c#L336-L351
347  // https://github.com/samtools/samtools/blob/4be69864304f4e7ac0113fdff9e4ff764b9d9267/bam_plcmd.c#L76
348  // as mentioned in the GitHub issue (but we use permalinks here, for future stability).
349  // We currently do not use the destructor, as we have nothing to free.
350  bam_plp_constructor( iter_, pileup_cd_create_ );
351  // bam_plp_destructor( iter_, pileup_cd_destroy_ );
352 }
353 
354 // -------------------------------------------------------------------------
355 // get_header_rg_tags_
356 // -------------------------------------------------------------------------
357 
358 std::vector<std::string> SamVariantInputStream::SamFileHandle::get_header_rg_tags_() const
359 {
360  // Inspired by https://github.com/samtools/samtools/blob/2ece68ef9d0bd302a74c952b55df1badf9e61aae/bam_split.c#L217
361 
362  // Prepare result.
363  assert( parent_ );
364  assert( sam_hdr_ );
365  std::vector<std::string> result;
366 
367  // Get the number of RG tags in the header of the file.
368  auto const n_rg = sam_hdr_count_lines( sam_hdr_, "RG" );
369  if( n_rg < 0 ) {
370  throw std::runtime_error(
371  "Failed to get @RG ID tags in file " + parent_->input_file_ +
372  ". Cannot split by RG read group tags."
373  );
374  }
375 
376  // Go through all RG tags and store them.
377  kstring_t id_val = KS_INITIALIZE;
378  for( size_t i = 0; i < static_cast<size_t>( n_rg ); ++i ) {
379 
380  // Get the next tag.
381  if( sam_hdr_find_tag_pos( sam_hdr_, "RG", i, "ID", &id_val ) < 0 ) {
382  ks_free( &id_val );
383  throw std::runtime_error(
384  "Failed to get @RG ID tags in file " + parent_->input_file_
385  );
386  }
387 
388  // Get the name of this rg tag. Need to free it afterwards ourselves.
389  // We need to turn it into a string anyways, so let's do this here.
390  char* tag = ks_release( &id_val );
391  result.emplace_back( tag );
392  free( tag );
393  }
394 
395  return result;
396 }
397 
398 // -------------------------------------------------------------------------
399 // Destructor
400 // -------------------------------------------------------------------------
401 
402 SamVariantInputStream::SamFileHandle::~SamFileHandle()
403 {
404  // if( mpileup_ ) {
405  // bam_mplp_destroy( mpileup_ );
406  // mpileup_ = nullptr;
407  // }
408  // if( handle.hts_iter ) {
409  // // destroy( handle.hts_iter );
410  // // handle.hts_iter = nullptr;
411  // }
412 
413  if( iter_ ) {
414  bam_plp_destroy( iter_ );
415  iter_ = nullptr;
416  }
417  if( sam_hdr_ ) {
418  sam_hdr_destroy( sam_hdr_ );
419  sam_hdr_ = nullptr;
420  }
421  if( hts_file_ ) {
422  hts_close( hts_file_ );
423  hts_file_ = nullptr;
424  }
425 }
426 
427 // =================================================================================================
428 // Sam File Handle Static Members
429 // =================================================================================================
430 
431 // -------------------------------------------------------------------------
432 // read_sam_
433 // -------------------------------------------------------------------------
434 
435 // static
436 int SamVariantInputStream::SamFileHandle::read_sam_(
437  void* data, bam1_t* bam
438 ) {
439  // The function processes a single mapped read. Reads that make it through here are then
440  // used by htslib for pileup processing, and subsequencly by us in the increment_() function
441  // to build our Variant object from it. Inspired from bam2depth and bedcov programs.
442  // Comment from bam2depth: read level filters better go here to avoid pileup.
443 
444  // Data in fact is a pointer to our handle.
445  auto handle = static_cast<SamVariantInputStream::SamFileHandle*>( data );
446  assert( handle );
447  assert( handle->parent_ );
448  assert( handle->hts_file_ );
449  assert( handle->sam_hdr_ );
450 
451  // Loop until we find a read that we want to use.
452  int ret;
453  while( true ) {
454 
455  // Read the data. Not sure why we need two behaviours depending on iter, but this is how it
456  // is done in samtools bam2depth, just that we here use the sam instead of the bam functions.
457  // int ret = handle->hts_iter
458  // ? sam_itr_next( handle->hts_file_, handle->hts_iter, bam )
459  // : sam_read1( handle->hts_file_, handle->sam_hdr_, bam )
460  // ;
461 
462  // Get the read, and check result.
463  // According to htslib, 0 on success, -1 on EOF, and <-1 on error, so let's check this.
464  ret = sam_read1( handle->hts_file_, handle->sam_hdr_, bam );
465  if( ret == -1 ) {
466  break;
467  }
468  if( ret < -1 ) {
469  throw std::runtime_error( "Error reading file " + handle->parent_->input_file() );
470  }
471 
472  // Define flag shorthands for readability.
473  auto const& flags_in_all = handle->parent_->flags_include_all_;
474  auto const& flags_in_any = handle->parent_->flags_include_any_;
475  auto const& flags_ex_all = handle->parent_->flags_exclude_all_;
476  auto const& flags_ex_any = handle->parent_->flags_exclude_any_;
477 
478  // Check per-read flags. Skip reads that match one of the conditions.
479  if( flags_in_all && (( bam->core.flag & flags_in_all ) != flags_in_all )) {
480  // from sam_view.c
481  // keep if (FLAG & N) == N (all on)
482  // int flag_on;
483  // case 'f': settings.flag_on |= bam_str2flag(optarg); break;
484  // if ( (b->core.flag & settings->flag_on) != settings->flag_on )
485  // return 1; // skip read
486 
487  continue;
488  }
489  if( flags_in_any && (( bam->core.flag & flags_in_any ) == 0 )) {
490  // from sam_view.c
491  // keep if (FLAG & N) != 0 (any on)
492  // int flag_anyon;
493  // case LONGOPT('g'):
494  // settings.flag_anyon |= bam_str2flag(optarg); break;
495  // if (settings->flag_anyon && ((b->core.flag & settings->flag_anyon) == 0))
496  // return 1; // skip read
497 
498  continue;
499  }
500  if( flags_ex_all && (( bam->core.flag & flags_ex_all ) == flags_ex_all )) {
501  // from sam_view.c
502  // reject if (FLAG & N) == N (any off)
503  // NB I think the "any off" in the previous line is wrong in samtools,
504  // and should be "all off"?!
505  // But the usage of "off" is misleading... we are looking for on bits...
506  // int flag_alloff;
507  // case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
508  // if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff))
509  // return 1; // skip read
510 
511  continue;
512  }
513  if( flags_ex_any && (( bam->core.flag & flags_ex_any ) != 0 )) {
514  // from sam_view.c
515  // keep if (FLAG & N) == 0 (all off)
516  // int flag_off;
517  // case 'F': settings.flag_off |= bam_str2flag(optarg); break;
518  // if (b->core.flag & settings->flag_off)
519  // return 1; // skip read
520 
521  continue;
522  }
523 
524  // Check minimum mapping quality as well.
525  if( static_cast<int>( bam->core.qual ) < handle->parent_->min_map_qual_ ) {
526  continue;
527  }
528  break;
529  }
530  return ret;
531 }
532 
533 // -------------------------------------------------------------------------
534 // pileup_cd_create_
535 // -------------------------------------------------------------------------
536 
537 // static
538 int SamVariantInputStream::SamFileHandle::pileup_cd_create_(
539  void* data, bam1_t const* b, bam_pileup_cd* cd
540 ) {
541  // Data in fact is a pointer to our handle, same as for the read function above.
542  auto handle = static_cast<SamVariantInputStream::SamFileHandle*>( data );
543 
544  // Only called when splitting by read groups.
545  assert( handle );
546  assert( handle->parent_ );
547  assert( handle->parent_->split_by_rg_ );
548 
549  // Details inspired by
550  // https://github.com/samtools/samtools/blob/2ece68ef9d0bd302a74c952b55df1badf9e61aae/bam_split.c#L476
551 
552  // Look up the RG tag of the current read.
553  // We have two fail cases: the tag is not in the header, or there is no tag at all.
554  // Either way, we make our decision dependend on whether we want to use the unaccounted
555  // reads or not. We could further distinguish between the cases, and offer a choice
556  // to throw an exception if the header does not contain all tags of the reads...
557  // Maybe do that if needed later.
558 
559  // Init a lookup into our tag index list, with its end as an indicator of failure.
560  // It will stay at this unless we find a proper RG index.
561  auto tag_itr = handle->rg_tags_.end();
562 
563  // Get RG tag from read and look it up in hash to find its index.
564  // Not obvious from htslib documentatin at all, but let's hope that this
565  // is the correct way to do this...
566  uint8_t* tag = bam_aux_get( b, "RG" );
567  if( tag != nullptr ) {
568  // Unfortunately, it seems that we have to take the detour via the string
569  // value of the tag here, instead of htslib directly offering the index.
570  // Hence all the shenannigans with the unordered map of indices...
571  // Also, please don't ask me why the htslib function for this is called bam_aux2Z().
572  // I guess Z is their notation for a string, and a tag such as RG is an "auxiliary" thing,
573  // so this function means "tag 2 string", in a sense?!
574  char* rg = bam_aux2Z( tag );
575  tag_itr = handle->rg_tags_.find( std::string( rg ));
576 
577  // Potential future extension: RG tag in read is not in the header.
578  // That indicates that something is wrong with the file. For now, we just treat this
579  // as an unaccounted read.
580  // if( tag_itr == handle->rg_tags_.end() ) {
581  // throw std::runtime_error(
582  // "Invalid @RG tag in read that is not listed in the header. "
583  // "Offending tag: '" + std::string( rg ) + "' in file " +
584  // parent_->input_file_
585  // );
586  // }
587  }
588 
589  size_t smp_idx = 0;
590  if( tag_itr != handle->rg_tags_.end() ) {
591  // We found the RG tag index.
592  // Assert that it is within the bounds - if we also use unaccounted,
593  // the base counts contains an additional element which should never be found
594  // by the index lookup. If we are here, we also need to have at least one actual
595  // sample plus potentially the unaccounted one, otherwise we would not be here.
596  // A special case are samples ignored due to being filtered by sample_names;
597  // in that case, their index is set to max, to indicate to the caller of this function
598  // to ignore them.
599  smp_idx = tag_itr->second;
600  if( handle->parent_->with_unaccounted_rg_ ) {
601  assert(
602  smp_idx < handle->target_sample_count_ - 1 ||
603  smp_idx == std::numeric_limits<size_t>::max()
604  );
605  } else {
606  assert(
607  smp_idx < handle->target_sample_count_ ||
608  smp_idx == std::numeric_limits<size_t>::max()
609  );
610  }
611  } else {
612  // One of the two error cases occurred.
613  // Decide based on whether we want to use unaccounted reads or skip them.
614  if( handle->parent_->with_unaccounted_rg_ ) {
615  // If we are here, we have at least initialized the samples to have the
616  // unaccounted base counts object.
617  // Get the last base counts object, which is the unaccounted one.
618  assert( handle->target_sample_count_ > 0 );
619  smp_idx = handle->target_sample_count_ - 1;
620  } else {
621  // If we are here, we have an unaccounted read and want to skip it.
622  // Use max size_t to indicate that.
623  smp_idx = std::numeric_limits<size_t>::max();
624  }
625  }
626 
627  // Final check. Then, we convert max to -1 for storage, as the bam_pileup_cd::i field
628  // is signed. We do not want to use -1 internally here, in order to keep unsigned integer
629  // comparisons for the indices simple.
630  assert(
631  smp_idx < handle->target_sample_count_ ||
632  smp_idx == std::numeric_limits<size_t>::max()
633  );
634  if( smp_idx == std::numeric_limits<size_t>::max() ) {
635  cd->i = -1;
636  } else {
637  cd->i = smp_idx;
638  }
639 
640  return 0;
641 }
642 
643 // -------------------------------------------------------------------------
644 // pileup_cd_destroy_
645 // -------------------------------------------------------------------------
646 
647 // static
648 int SamVariantInputStream::SamFileHandle::pileup_cd_destroy_(
649  void* data, bam1_t const* b, bam_pileup_cd* cd
650 ) {
651  // Nothing to do, as we only use bam_pileup_cd::i, which does not need freeing.
652  // We keep this function around for reference, in case it's needed later.
653  (void) data;
654  (void) b;
655  (void) cd;
656  return 0;
657 }
658 
659 // =================================================================================================
660 // Iterator Public Members
661 // =================================================================================================
662 
663 // -------------------------------------------------------------------------
664 // Concstructor
665 // -------------------------------------------------------------------------
666 
667 SamVariantInputStream::Iterator::Iterator( SamVariantInputStream const* parent )
668  : parent_( parent )
669  , handle_( std::make_shared<SamFileHandle>() )
670 {
671  // The constructor needs to be here in the cpp, as we just defined SamFileHandle above.
672  // Otherwise, the init for the shared pointer to handle_ above cannot work.
673 
674  // Assert that the nucleotide codes in htslib are as we expect them here.
675  // This should be a static assertion, but C char* does not support that,
676  // so instead we do it once here on construction - an acceptable overhead.
677  assert( strcmp( seq_nt16_str, "=ACMGRSVTWYHKDBN" ) == 0 );
678 
679  // Edge case: empty iterator,
680  assert( parent_ );
681  assert( handle_ );
682  if( parent_->input_file_.empty() ) {
683  return;
684  }
685 
686  // Initialize the data structures of the handle. This sets up all htslib data strcutures,
687  // and if needed the RG tag filtering.
688  handle_->init_( parent_ );
689 
690  // Finally, get the first line.
691  increment_();
692 }
693 
694 // -------------------------------------------------------------------------
695 // rg_tags
696 // -------------------------------------------------------------------------
697 
698 std::vector<std::string> SamVariantInputStream::Iterator::rg_tags( bool all_header_tags ) const
699 {
700  // Previously, this function used to rely on a re-filling of the list of tags, using
701  // the settings of the parent_ SamVariantInputStream. However, in cases where the per-read
702  // filter settings would fitler out _all_ reads (e.g., super high min map qual filter),
703  // the iterator would already reach its end when being intitialized, as no read would ever be
704  // used, and hence parent_ would already be nullptr when this function here was called from
705  // the user. So no we do not rely on the parent_ any more, and instead store the list
706  // in the handle already upon construction.
707  // In the future, it might still be necessary to fix the underlying issue, i.e., the iterator
708  // being ended, and the parent being null, if the use case of this class changes. But for
709  // now, it seems that the approach still works, so we keep it that way.
710 
711  assert( handle_ );
712  if( all_header_tags ) {
713  // Special case, if we just want to get all the rg tags from the file header.
714  // This is a bit trippy in case that the iteration already finished... In that case,
715  // this iterator will already have it's parent_ pointer nulled, but the handle_ will not,
716  // so that we can use this to get the rg header tags again from the file.
717  // In hindsight, it might have been better to not use the parent_ == nullptr as an indicator
718  // that we have finished iteration... On the other hand, this is a pretty good way to
719  // ensure that no part of the iterator is called accidentally afterwards, as this will
720  // immediately lead to failure.
721  return handle_->get_header_rg_tags_();
722  }
723 
724  // Normal case: get the tags that are acutally used, potentially after splitting and filtering.
725  return handle_->target_sample_names_;
726 }
727 
728 // -------------------------------------------------------------------------
729 // sample_size
730 // -------------------------------------------------------------------------
731 
732 size_t SamVariantInputStream::Iterator::sample_size() const
733 {
734  assert( handle_ );
735  return handle_->target_sample_count_;
736 }
737 
738 // =================================================================================================
739 // Iterator Private Members
740 // =================================================================================================
741 
742 // -------------------------------------------------------------------------
743 // increment_
744 // -------------------------------------------------------------------------
745 
746 void SamVariantInputStream::Iterator::increment_()
747 {
748  // Only to be called when the iterator is still valid (not past-the-end).
749  assert( parent_ );
750  assert( handle_ );
751  assert( handle_->parent_ );
752  assert( handle_->sam_hdr_ );
753  assert( handle_->iter_ );
754 
755  // Find the next input position that we want to consider (it has data and fitting read depth,
756  // and is not filtered out due to the region list).
757  // tid is the chromosome name index, pos the position on the chromosome, and n is read depth ("coverage")
758  int tid, pos, n;
759  bam_pileup1_t const* plp;
760  while( true ) {
761  plp = bam_plp_auto( handle_->iter_, &tid, &pos, &n );
762 
763  // Check for end of the iteration and error.
764  // Unfortunately, we cannot check the internal iter->error here, as the iter type bam_plp_t
765  // is only defined in `sam.c`, and hence not accessible to us here. So instead, any errors
766  // will be printed to stderr by htslib, and we will instead just end our iteration here,
767  // not being able to know whether this is due to the end of the file or an error.
768  // Typical errors that we would want to catch here are that the reads or chromosomes are not
769  // sorted, in which case an iterator such as this one here does not work.
770  // Users will get the stderr output that hints at that though, which is something.
771  if( plp == nullptr ) {
772  parent_ = nullptr;
773  return;
774  }
775  if( tid < 0 || n < 0 ) {
776  continue;
777  }
778 
779  // Region filter, used only if one was provided.
780  if(
781  parent_->region_filter_ &&
782  ! parent_->region_filter_->is_covered(
783  std::string( handle_->sam_hdr_->target_name[tid] ),
784  static_cast<size_t>( pos ) + 1
785  )
786  ) {
787  continue;
788  }
789 
790  // Coverage / read depth check.
791  if( parent_->min_depth_ && n < parent_->min_depth_ ) {
792  continue;
793  }
794  if( parent_->max_depth_ && n > parent_->max_depth_ ) {
795  continue;
796  }
797  break;
798  }
799 
800  // htslib takes care of ordering along chromosomes already, so let's just assert this.
801  // If the position is not incrementing, that means we switched chromosome.
802  // We do not assert that chromosomes are in correct lexicographical order, as apparently
803  // that is not necessary for the correct functioning of htslib sam iteration.
804  // assert( std::string( handle_->sam_hdr_->target_name[tid] ) >= current_variant_.chromosome);
805  assert( pos >= 0 );
806  assert(
807  ( static_cast<size_t>( pos ) + 1 > current_variant_.position ) ||
808  ( std::string( handle_->sam_hdr_->target_name[tid] ) != current_variant_.chromosome )
809  );
810 
811  // Set current chromosome/locus, make 1-based for our case.
812  current_variant_.chromosome = std::string( handle_->sam_hdr_->target_name[tid] );
813  current_variant_.position = static_cast<size_t>( pos ) + 1;
814  current_variant_.reference_base = 'N';
815  current_variant_.alternative_base = 'N';
816  current_variant_.status.reset();
817 
818  // Resize to the number of samples, and reset the variant base count tallies for all samples.
819  // We fully resize here, in case that the current_variant_ has been moved from by the user,
820  // see https://stackoverflow.com/a/55054380/4184258. All other member variables of the Variant
821  // are in defined states after being moved from, so the above initializations are okay,
822  // but the vector might be empty or of the wrong size afterwards.
823  // This also resets all counts to 0, and the status to passing.
824  current_variant_.samples.resize( handle_->target_sample_count_ );
825  for( auto& sample : current_variant_.samples ) {
826  sample = SampleCounts();
827  }
828 
829  // Go through the read data at the current position and tally up their base counts.
830  // The loop goes through all bases that cover the position. We access the reads that these
831  // bases belong to, in order to get information on base quality and the actual nucleotide, etc.
832  // See https://github.com/samtools/samtools/blob/ae1f9d8809/bam_plcmd.c#L62
833  for( int i = 0; i < n; ++i ) {
834  bam_pileup1_t const* p = plp + i;
835  process_base_( p );
836  }
837 }
838 
839 // -------------------------------------------------------------------------
840 // process_base_
841 // -------------------------------------------------------------------------
842 
843 void SamVariantInputStream::Iterator::process_base_( bam_pileup1_t const* p )
844 {
845  assert( parent_ );
846  assert( handle_ );
847 
848  // Check per base quality. If it does not meet our threshold, we skip this base,
849  // and go to the next one (next iteration of the for loop from where this function is called).
850  int const qual = p->qpos < p->b->core.l_qseq
851  ? bam_get_qual(p->b)[p->qpos]
852  : 0
853  ;
854  if( qual < parent_->min_base_qual_ ) {
855  return;
856  }
857 
858  // Get the sample, according to the read tag if set, or, if not set,
859  // just get the one sample that we have initialized.
860  auto const smp_idx = get_sample_index_( p );
861  if( smp_idx == std::numeric_limits<size_t>::max() ) {
862  // If we are here, we have an unaccounted read or one of a sample that we want to filter out,
863  // and hence skip it.
864  return;
865  }
866  assert( current_variant_.samples.size() == handle_->target_sample_count_ );
867  assert( smp_idx < current_variant_.samples.size() );
868  assert( handle_->target_sample_count_ == current_variant_.samples.size() );
869  auto& sample = current_variant_.samples[smp_idx];
870 
871  // Check deletions. If it is one, note that, and then we are done for this base.
872  if( p->is_del || p->is_refskip ){
873  ++sample.d_count;
874  return;
875  }
876 
877  // Get the htslib internal code for the nucleotide as defined in seq_nt16_str, in 0-15,
878  // which is what bam_seqi() returns, and use it to tally up (increment) our base counts.
879  uint8_t* seq = bam_get_seq (p->b );
880  uint8_t nuc = bam_seqi( seq, p->qpos );
881  switch( nuc ) {
882  case 1: {
883  ++sample.a_count;
884  break;
885  }
886  case 2: {
887  ++sample.c_count;
888  break;
889  }
890  case 4: {
891  ++sample.g_count;
892  break;
893  }
894  case 8: {
895  ++sample.t_count;
896  break;
897  }
898  case 15: {
899  ++sample.n_count;
900  break;
901  }
902  default: {
903  throw std::runtime_error(
904  "Invalid base in sam/bam/cram file " + parent_->input_file_ +
905  " at " + current_variant_.chromosome + ":" +
906  std::to_string( current_variant_.position ) + ". Found " +
907  std::string( 1, seq_nt16_str[nuc] ) + ", but expected [ACGTN]."
908  );
909  }
910  }
911 }
912 
913 // -------------------------------------------------------------------------
914 // get_sample_index_
915 // -------------------------------------------------------------------------
916 
917 size_t SamVariantInputStream::Iterator::get_sample_index_( bam_pileup1_t const* p ) const
918 {
919  assert( parent_ );
920  assert( handle_ );
921 
922  // If we are not using splitting by read groups, we just return the index of the single sample
923  // that we are using for all reads.
924  if( ! parent_->split_by_rg_ ) {
925  return 0;
926  }
927 
928  // If we are here, we have read group splitting, so we have set the callbacks.
929  assert( parent_->split_by_rg_ );
930 
931  // Now get the bam client data, and translate it back to our format, where -1 is turned
932  // into max int instead, in order to work easier with unsigned int comparisons for the indexing.
933  auto const index = p->cd.i;
934  if( index < 0 ) {
935  assert( index == -1 );
936  return std::numeric_limits<size_t>::max();
937  } else {
938  assert( static_cast<size_t>( index ) < handle_->target_sample_count_ );
939  return static_cast<size_t>( index );
940  }
941 }
942 
943 // =================================================================================================
944 // Sam Variant Input Iterator
945 // =================================================================================================
946 
947 SamVariantInputStream::SamVariantInputStream(
948  std::string const& infile,
949  std::unordered_set<std::string> const& rg_tag_filter,
950  bool inverse_rg_tag_filter
951 ) {
952  // Set the file, including error checking.
953  input_file( infile );
954 
955  // Skip unmapepd and duplicates by default. We set the flags here in the cpp,
956  // so that our header file can stay free of htslib includes and constants.
957  // flags_ = ( BAM_FUNMAP | BAM_FDUP );
958 
959  // Set the rg tag filter
960  rg_tag_filter_ = rg_tag_filter;
961  inverse_rg_tag_filter_ = inverse_rg_tag_filter;
962 }
963 
964 // =================================================================================================
965 // Validity check
966 // =================================================================================================
967 
968 // Make sure that the bases are as we expect them.
969 // Nope, cannot check, because seq_nt16_str is not declared constexpr... :-(
970 // static_assert(
971 // seq_nt16_str[ 1] == 'A' &&
972 // seq_nt16_str[ 2] == 'C' &&
973 // seq_nt16_str[ 4] == 'G' &&
974 // seq_nt16_str[ 8] == 'T' &&
975 // seq_nt16_str[15] == 'N',
976 // "Definition of the nucleotide codes in seq_nt16_str[] differs between htslib and genesis. "
977 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
978 // );
979 
980 // Usage examples:
981 // mpileup command: https://github.com/samtools/samtools/blob/develop/bam_plcmd.c
982 // bedcov: https://github.com/samtools/samtools/blob/develop/bedcov.c
983 
984 } // namespace population
985 } // namespace genesis
986 
987 #endif // htslib guard
functions.hpp
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
sam_variant_input_stream.hpp
genesis::population::SamVariantInputStream::SamVariantInputStream
SamVariantInputStream()
Create a default instance, with no input. This is also the past-the-end iterator.
Definition: sam_variant_input_stream.hpp:347
bam_plp_t
struct bam_plp_s * bam_plp_t
Definition: sam_variant_input_stream.hpp:61