A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
labels.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
38 
39 #include <algorithm>
40 
41 namespace genesis {
42 namespace sequence {
43 
44 // =================================================================================================
45 // General
46 // =================================================================================================
47 
48 Sequence const* find_sequence( SequenceSet const& set, std::string const& label )
49 {
50  for (Sequence const& s : set) {
51  if (s.label() == label) {
52  return &s;
53  }
54  }
55  return nullptr;
56 }
57 
58 std::unordered_set<std::string> labels( SequenceSet const& set )
59 {
60  std::unordered_set<std::string> result;
61  for( auto const& seq : set ) {
62  result.insert( seq.label() );
63  }
64  return result;
65 }
66 
67 // =================================================================================================
68 // Uniqueness
69 // =================================================================================================
70 
71 bool has_unique_labels( SequenceSet const& set, bool case_sensitive )
72 {
73  std::unordered_set< std::string > labels;
74  std::string label;
75 
76  for( auto const& seq : set ) {
77  if( case_sensitive ) {
78  label = seq.label();
79  } else {
80  label = utils::to_lower( seq.label() );
81  }
82 
83  if( labels.count( label ) > 0 ) {
84  return false;
85  } else {
86  labels.insert( label );
87  }
88  }
89  return true;
90 }
91 
92 void relabel_sha1( Sequence& seq )
93 {
94  auto digest = utils::SHA1::from_string_hex( seq.sites() );
95  seq.label( digest );
96 }
97 
// NOTE(review): the signature line of this definition (listing line 98) is missing from this
// extract, so its name, return type and parameter list cannot be confirmed here.
// The body applies relabel_sha1() to every element of `set`, which is iterated by mutable
// reference; `set` is presumably a SequenceSet& parameter - TODO confirm against labels.cpp.
99 {
100  for( auto& seq : set ) {
101  relabel_sha1( seq );
102  }
103 }
104 
105 // =================================================================================================
106 // Validity
107 // =================================================================================================
108 
/**
 * @brief Check whether a given string is a valid label for a Sequence.
 *
 * A label is rejected if it contains any character for which `isgraph()` is false
 * (e.g., spaces, tabs, control characters), or any of the characters `:,();[]'`.
 * The empty string contains no offending character and is therefore accepted.
 *
 * @param label String to check.
 * @return `true` if every character of @p label is acceptable, `false` otherwise.
 */
bool is_valid_label( std::string const& label )
{
    // Printable characters that are nevertheless not allowed in a label.
    std::string const invalid_chars = ":,();[]'";

    for( char const c : label ) {
        // The <ctype.h> classification functions are only defined for values representable
        // as unsigned char (or EOF). Plain char may be signed, so bytes >= 0x80 have to be
        // converted first; passing them directly is undefined behavior.
        auto const uc = static_cast<unsigned char>( c );

        if( ! isgraph( uc ) || invalid_chars.find( c ) != std::string::npos ) {
            return false;
        }
    }
    return true;
}
119 
120 bool has_valid_label( Sequence const& seq )
121 {
122  return is_valid_label( seq.label() );
123 }
124 
125 bool has_valid_labels( SequenceSet const& set )
126 {
127  for( auto const& seq : set ) {
128  if( ! has_valid_label( seq )) {
129  return false;
130  }
131  }
132  return true;
133 }
134 
/**
 * @brief Sanitize a label by replacing all invalid characters with underscores.
 *
 * A character is replaced if `isgraph()` is false for it (e.g., spaces, tabs, control
 * characters), or if it is one of `:,();[]'`. All other characters are copied unchanged,
 * so the result always has the same length as the input.
 *
 * @param label Label to sanitize.
 * @return Copy of @p label in which every invalid character is replaced by `_`.
 */
std::string sanitize_label( std::string const& label )
{
    std::string result;
    result.reserve( label.size() );

    // Printable characters that are nevertheless not allowed in a label.
    std::string const invalid_chars = ":,();[]'";

    for( char const c : label ) {
        // The <ctype.h> classification functions are only defined for values representable
        // as unsigned char (or EOF). Plain char may be signed, so bytes >= 0x80 have to be
        // converted first; passing them directly is undefined behavior.
        auto const uc = static_cast<unsigned char>( c );

        bool const invalid = ! isgraph( uc ) || invalid_chars.find( c ) != std::string::npos;
        result += invalid ? '_' : c;
    }
    return result;
}
150 
// NOTE(review): the signature line of this definition (listing line 151) is missing from this
// extract, so its name, return type and parameter list cannot be confirmed here.
// The body reads the label of `seq`, passes it through sanitize_label(), and stores the result
// back as the label of `seq`; `seq` is presumably a Sequence& parameter - TODO confirm against
// labels.cpp.
152 {
153  seq.label( sanitize_label( seq.label() ));
154 }
155 
157 {
158  for( auto& seq : set ) {
159  sanitize_label( seq );
160  }
161 }
162 
163 // =================================================================================================
164 // Modifiers
165 // =================================================================================================
166 
168  SequenceSet& set,
169  std::unordered_set<std::string> const& labels,
170  bool invert
171 ) {
172  auto new_last = std::remove_if(
173  set.begin(),
174  set.end(),
175  [&] ( Sequence const& seq ) {
176  return ( !invert && labels.count( seq.label() ) > 0 ) ||
177  ( invert && labels.count( seq.label() ) == 0 );
178  }
179  );
180  set.remove( new_last , set.end() );
181 }
182 
183 } // namespace sequence
184 } // namespace genesis
void relabel_sha1(Sequence &seq)
Relabel the Sequence using the SHA1 hash digest of its sites.
Definition: labels.cpp:92
std::string const & sites() const
Definition: sequence.cpp:72
void filter_by_label_list(SequenceSet &set, std::unordered_set< std::string > const &labels, bool invert)
Remove all those Sequences from a SequenceSet whose labels are in the given list. ...
Definition: labels.cpp:167
bool has_unique_labels(SequenceSet const &set, bool case_sensitive)
Return true iff all labels of the Sequences in the SequenceSet are unique.
Definition: labels.cpp:71
std::string to_lower(std::string const &str)
Return an all-lowercase copy of the given string, locale-aware.
Definition: string.cpp:249
std::string const & label() const
Definition: sequence.cpp:44
bool has_valid_labels(SequenceSet const &set)
Check whether all Sequences in a SequenceSet have valid labels.
Definition: labels.cpp:125
void sanitize_labels(SequenceSet &set)
Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with underscores.
Definition: labels.cpp:156
static std::string from_string_hex(std::string const &input)
Calculate the checksum for the content of a string.
Definition: sha1.cpp:166
Provides some commonly used string utility functions.
void remove(size_t index)
Remove the Sequence at a given index from the SequenceSet.
bool is_valid_label(std::string const &label)
Check whether a given string is a valid label for a Sequence.
Definition: labels.cpp:109
Store a set of Sequences.
std::string sanitize_label(std::string const &label)
Sanitize a label by replacing all invalid characters with underscores.
Definition: labels.cpp:135
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:58
bool has_valid_label(Sequence const &seq)
Check whether a Sequence has a valid label.
Definition: labels.cpp:120
Sequence const * find_sequence(SequenceSet const &set, std::string const &label)
Return a pointer to a Sequence with a specific label, or nullptr iff not found.
Definition: labels.cpp:48