A library for working with phylogenetic and population genetic data.
v0.32.0
function/genome_locus_set.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
38 
39 #include <cassert>
40 #include <cstring>
41 #include <stdexcept>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Conversion
48 // =================================================================================================
49 
51 {
53  for( auto const& elem : set ) {
54  // The elements are bitvectors that contain an extra entry for the 0th bit,
55  // which we hence need to subtract from the length here.
56  size_t len = elem.second.size() - 1;
57  if( elem.second.size() == 0 ) {
58  len = 0;
59  }
60  result.add( elem.first, len );
61  }
62  return result;
63 }
64 
65 // =================================================================================================
66 // Mask Fasta Reading
67 // =================================================================================================
68 
70  std::shared_ptr< utils::BaseInputSource > source,
71  size_t mask_min,
72  bool invert
73 ) {
74  GenomeLocusSet result;
75 
76  // Boundary checks.
77  if( mask_min > 9 ) {
78  throw std::invalid_argument(
79  "Fasta mask min value is " + std::to_string( mask_min ) + ", but has to be in [0-9]."
80  );
81  }
82 
83  // Read using a fasta input iterator
84  for( auto const& seq : sequence::FastaInputStream( source )) {
85  if( result.has_chromosome( seq.label() )) {
86  throw std::runtime_error(
87  "Duplicate sequence name \"" + seq.label() + "\" in " + source->source_name()
88  );
89  }
90 
91  // Make a bitvector of the correct size and fill it.
92  // We use 1-based positions in the GenomeLocusSet, so we have to adjust for that.
93  auto bv = utils::Bitvector( seq.length() + 1 );
94  for( size_t i = 0; i < seq.length(); ++i ) {
95  if( seq[i] < '0' || seq[i] > '9' ) {
96  throw std::runtime_error(
97  "Invalid mask code '" + std::string( 1, seq[i] ) + "' not in [0-9] "
98  "in sequence \"" + seq.label() + "\" in " + source->source_name()
99  );
100  }
101 
102  // Get the numerical value, and compare it to our min.
103  size_t const val = seq[i] - '0';
104  if( val > mask_min ) {
105  bv.set( i + 1 );
106  }
107  }
108 
109  // If we invert, we do that here at the end. We could also switch in the set function
110  // above, but it's easier to do this in bulk. We need to unset the first bit then.
111  if( invert ) {
112  bv.negate();
113  bv.unset(0);
114  }
115 
116  result.add( seq.label(), bv );
117  }
118 
119  return result;
120 }
121 
122 } // namespace population
123 } // namespace genesis
genesis::population::read_mask_fasta
GenomeLocusSet read_mask_fasta(std::shared_ptr< utils::BaseInputSource > source, size_t mask_min, bool invert)
Read an input source as a mask fasta file, and return its content as a GenomeLocusSet.
Definition: function/genome_locus_set.cpp:69
genesis::sequence::SequenceDict::add
void add(Sequence const &sequence, bool also_look_up_first_word=true)
Add a Sequence to the dictionary.
Definition: sequence_dict.hpp:133
fasta_reader.hpp
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::population::GenomeLocusSet::has_chromosome
bool has_chromosome(std::string const &chromosome) const
Return whether a chromosome is stored.
Definition: genome_locus_set.hpp:515
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::sequence::SequenceDict::size
size_t size() const
Definition: sequence_dict.hpp:218
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
fastx_input_stream.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
char.hpp
genesis::sequence::FastxInputStream
Stream through an input source and parse it as Fasta or Fastq sequences.
Definition: fastx_input_stream.hpp:55
genesis::population::reference_locus_set_to_dict
genesis::sequence::SequenceDict reference_locus_set_to_dict(GenomeLocusSet const &set)
Definition: function/genome_locus_set.cpp:50
bitvector.hpp
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127
genome_locus_set.hpp
sequence.hpp
genesis::utils::Bitvector
Definition: bitvector.hpp:50