|
A library for working with phylogenetic and population genetic data.
v0.32.0
|
|
Go to the documentation of this file.
41 #include <unordered_set>
65 if( line.size() == 0 ) {
68 if( line[0].
length() != 3 || line[0][0] !=
'@' ) {
69 throw std::runtime_error(
71 " does not start with a header record type code '@XX', but with '" + line[0] +
"'."
76 assert( line.size() > 0 );
77 assert( line[0].size() == 3 );
78 assert( line[0][0] ==
'@' );
79 if( line[0] !=
"@SQ" ) {
86 for(
size_t i = 1; i < line.size(); ++i ) {
87 auto const& field = line[i];
88 if( field.size() < 3 || field[2] !=
':' ) {
89 throw std::runtime_error(
91 " contains an @SQ record that is not of the form 'TAG:VALUE', "
92 "but '" + field +
"'."
95 assert( field.size() >=3 );
96 assert( field[2] ==
':' );
100 sn = field.substr( 3 );
104 throw std::runtime_error(
105 "Invalid sequence dict file: Line " +
std::to_string( line_cnt ) +
106 " contains an @SQ record with a field for the sequence length LN"
107 " whose VALUE is not a number, but '" + field.substr( 3 ) +
"'."
117 if( sn.empty() || ln == 0 ) {
118 throw std::runtime_error(
119 "Invalid sequence dict file: Line " +
std::to_string( line_cnt ) +
120 " contains an @SQ record with no valid SN or LN fields."
125 result.
add( sn, ln );
145 if( line.size() == 0 ) {
148 if( line.size() != 5 ) {
149 throw std::runtime_error(
151 " has " +
std::to_string( line.size() ) +
" columns instead of the expected 5 columns."
156 assert( line.size() == 5 );
157 std::string
const sn = line[0];
159 throw std::runtime_error(
161 " contains a record with a LENGTH field that is not a number, but '" + line[1] +
"'."
169 if( sn.empty() || ln == 0 ) {
170 throw std::runtime_error(
172 " contains a record with invalid NAME or LENGTH fields."
177 result.
add( sn, ln );
187 for(
auto const& elem : input ) {
188 result.
add( elem.label(), elem.length() );
213 auto check_dict_empty_names_ = [](
SequenceDict const& dict ) {
214 for(
auto const& e : dict ) {
215 if( e.name.empty() ) {
216 throw std::invalid_argument(
"Invalid reference sequences with empty names." );
220 check_dict_empty_names_( lhs );
221 check_dict_empty_names_( rhs );
231 for(
auto const& e : lhs ) {
241 for(
auto const& e : rhs ) {
250 for(
auto const& e : lhs ) {
259 for(
auto const& e : lhs ) {
263 if( e.length != rhs.
get( e.name ).
length ) {
270 throw std::invalid_argument(
271 "Invalid ReferenceComparisonMode in compatible_references()"
285 for(
size_t i = 0; i < dict.
size(); ++i ) {
286 if( dict[i].name.empty() || set[i].label().
empty() ) {
289 if( match_first_word ) {
291 auto const s_set =
utils::split( set[i].label(),
"\t " );
292 assert( ! s_dct.empty() );
293 assert( ! s_set.empty() );
294 if( s_dct[0] != s_set[0] ) {
297 }
else if( dict[i].name != set[i].label() || dict[i].
length != set[i].
length() ) {
SequenceDict sequence_iterable_to_dict_(T const &input)
void add(Sequence const &sequence, bool also_look_up_first_word=true)
Add a Sequence to the dictionary.
@ kSharedOnly
Either reference set can contain sequences that are not in the other. Only the shared ones are used f...
bool contains(std::string const &name) const
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
@ kLeftSuperset
The left hand reference set is allowed to contain sequences that are not in the right hand side....
bool is_convertible_to_unsigned_integer(std::string const &str)
Return whether a string can be converted to unsigned integer.
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
bool verify(SequenceDict const &dict, SequenceSet const &set, bool match_first_word)
Verify that a SequenceDict fits a SequenceSet.
SequenceDict sequence_set_to_dict(SequenceSet const &set)
Get the sequence dict/index information of a given set of Sequences.
bool starts_with(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix.
SequenceDict reference_genome_to_dict(ReferenceGenome const &rg)
Get the sequence dict/index information of a given set of Sequences that are stored in a ReferenceGen...
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
std::string to_string(GenomeLocus const &locus)
Provides some commonly used string utility functions.
bool convert_to_unsigned_integer(std::string const &str, unsigned long long &result)
Convert a string to unsigned integer, store the result in result, and return whether the conversion a...
bool compatible_references(SequenceDict const &lhs, SequenceDict const &rhs, ReferenceComparisonMode mode)
Verify that a SequenceDict fits a SequenceSet.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Store a set of Sequences.
ReferenceComparisonMode
Chose how to deal with sub-/super-sets when comparing references.
@ kStrict
Both compared reference sets have to contain the exact same sequence names.
SequenceDict read_sequence_dict(std::shared_ptr< utils::BaseInputSource > source)
Read a .dict sequence dictionary file, describing, e.g., reference genome sequence properties.
Entry const & get(std::string const &name) const
SequenceDict read_sequence_fai(std::shared_ptr< utils::BaseInputSource > source)
Read a .fai sequence index file, describing, e.g., reference genome sequence properties.
Lookup of Sequences of a reference genome.
@ kRightSuperset
The right hand reference set is allowed to contain sequences that are not in the lefthand side....