A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
keyed_newick_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 
38 namespace genesis {
39 namespace tree {
40 
41 // =================================================================================================
42 // Static Member Initialization
43 // =================================================================================================
44 
45 std::vector<KeyedAttributeTreeNewickReaderPlugin::AttributeDescriptor>
46 KeyedAttributeTreeNewickReaderPlugin::nhx_attributes_ ={
47  { "AC", Target::kNode, "AC", "", false }, // sequence accession
48  { "Co", Target::kNode, "Co", "", false }, // collapse this node when drawing the tree (default is not to collapse)
49  { "D", Target::kNode, "D", "", false }, // duplication event
50  { "E", Target::kNode, "E", "", false }, // EC number
51  { "GN", Target::kNode, "GN", "", false }, // gene name
52  { "O", Target::kNode, "O", "", false }, // orthologous to this external node
53  { "S", Target::kNode, "S", "", false }, // species name
54  { "SO", Target::kNode, "SO", "", false }, // "super orthologous" (no duplications on paths)
55  { "T", Target::kNode, "T", "", false }, // taxonomy ID
56 
57  { "B", Target::kEdge, "B", "", false }, // confidence value for parent branch
58  { "L", Target::kEdge, "L", "", false }, // log likelihood value on parent branch
59  { "Sw", Target::kEdge, "Sw", "", false }, // placing a subtree on the parent branch of this node
60  // makes the tree significantly worse according to
61  // Kishino/Hasegawa test (or similar)
62 };
63 
64 // =================================================================================================
65 // Settings
66 // =================================================================================================
67 
69  std::string const& key,
70  Target target
71 ) {
72  keyed_attributes_.push_back({ key, target, key, "", false });
73  return *this;
74 }
75 
77  std::string const& source_key,
78  Target target,
79  std::string const& target_key
80 ) {
81  keyed_attributes_.push_back({ source_key, target, target_key, "", false });
82  return *this;
83 }
84 
86  std::string const& source_key,
87  Target target,
88  std::string const& target_key,
89  std::string const& default_value
90 ) {
91  keyed_attributes_.push_back({ source_key, target, target_key, default_value, true });
92  return *this;
93 }
94 
96  Target target
97 ) {
98  catch_all_attributes_.push_back({ "", target, "", "", true });
99  return *this;
100 }
101 
103 {
104  prefix( "&&NHX" );
105  separator( ":" );
106  assigner( "=" );
107  return *this;
108 }
109 
111 {
113  for( auto const& attrib : nhx_attributes_ ) {
114  keyed_attributes_.push_back( attrib );
115  }
116  return *this;
117 }
118 
120 {
121  prefix_ = "&";
122  separator_ = ",";
123  assigner_ = "=";
124  trim_ = true;
125 
126  keyed_attributes_.clear();
127  catch_all_attributes_.clear();
128 }
129 
130 // =================================================================================================
131 // Plugin Functions
132 // =================================================================================================
133 
135 {
136  // Speedup.
137  if( ! has_attributes_for_target_( Target::kNode ) ) {
138  return;
139  }
140 
141  // Prepare data.
142  auto data = get_data_( element );
143  auto& attributes = node.data<AttributeTreeNodeData>().attributes;
144 
145  // Process all attributes.
146  process_keyed_attributes_( data, attributes, Target::kNode );
147  process_catch_all_attributes_( data, attributes, Target::kNode );
148 }
149 
151 {
152  // Speedup.
153  if( ! has_attributes_for_target_( Target::kEdge ) ) {
154  return;
155  }
156 
157  // Prepare data.
158  auto data = get_data_( element );
159  auto& attributes = edge.data<AttributeTreeEdgeData>().attributes;
160 
161  // Process all attributes.
162  process_keyed_attributes_( data, attributes, Target::kEdge );
163  process_catch_all_attributes_( data, attributes, Target::kEdge );
164 }
165 
167 {
168  // Set node data creation function.
169  reader.create_node_data_plugin = []( TreeNode& node ){
170  node.reset_data( AttributeTreeNodeData::create() );
171  };
172 
173  // Set edge data creation function.
174  reader.create_edge_data_plugin = []( TreeEdge& edge ){
175  edge.reset_data( AttributeTreeEdgeData::create() );
176  };
177 
178  // Add node manipulation functions.
179  reader.element_to_node_plugins.push_back(
180  [&]( NewickBrokerElement const& element, TreeNode& node ) {
181  element_to_node( element, node );
182  }
183  );
184 
185  // Add edge manipulation functions.
186  reader.element_to_edge_plugins.push_back(
187  [&]( NewickBrokerElement const& element, TreeEdge& edge ) {
188  element_to_edge( element, edge );
189  }
190  );
191 }
192 
193 // =================================================================================================
194 // Internal Functions
195 // =================================================================================================
196 
197 bool KeyedAttributeTreeNewickReaderPlugin::has_attributes_for_target_( Target target ) const
198 {
199  // This function is used by the element processing functions in order to check whether they
200  // have any work to do. If there is no attribute that targets the Nodes or Edges, then
201  // we do not need to process the data at all for that target.
202  // Another way, that would give even more speedup, would be to process the data only once
203  // (call get_data_ once) instead of twice (for nodes and for edges separately), but this would
204  // meant that we need to introduce more involved plugin functions to the newick reader.
205  // Either we'd need an element_to_node_and_edge function, or some for of element preprocessing,
206  // that would store some state (the data in our case) while it is processed, and that can then
207  // be used by the element_to_... functions, instead of processing the data twice. This is
208  // however more complex, and involves a fragile state that is only valid during the processing
209  // of one element. So, for now, we live with the slight performance issue that we sometimes
210  // need to split the comment data twice...
211 
212  for( auto const& attrs : keyed_attributes_ ) {
213  if( attrs.target == target ) {
214  return true;
215  }
216  }
217  for( auto const& attrs : catch_all_attributes_ ) {
218  if( attrs.target == target ) {
219  return true;
220  }
221  }
222  return false;
223 }
224 
225 KeyedAttributeTreeNewickReaderPlugin::PairList KeyedAttributeTreeNewickReaderPlugin::get_data_(
226  NewickBrokerElement const& element
227 ) const {
228  auto result = PairList();
229 
230  // Process all comments and see whether they have the desireed prefix.
231  for( auto const& comment : element.comments ) {
232  if( ! utils::starts_with( comment, prefix_ ) ) {
233  continue;
234  }
235 
236  // Skip the prefix...
237  size_t pos = prefix_.size();
238  size_t last_pos = prefix_.size();
239 
240  // ... then split the rest of the comment into key value pairs.
241  while( pos < comment.length() ) {
242 
243  // Find the next occurence of the separator, limited to the end of the string.
244  pos = comment.find( separator_, last_pos );
245  if( pos == std::string::npos ) {
246  pos = comment.length();
247  }
248 
249  // We found a key value pair.
250  if( pos != last_pos ) {
251  // Get the pair. This is a copy. Could be avoided, but I am lazy today.
252  assert( pos > last_pos );
253  auto entry = std::string( comment.data() + last_pos, pos - last_pos );
254 
255  // Split it according to the assign string.
256  auto ass_pos = entry.find( assigner_ );
257 
258  // If there is an assign sign, use it to split.
259  // This also avoids to add empty data. For example, NHX starts with a separator
260  // after the prefix, so there is an empty field in the beginning. As this does not
261  // contain the assigner, it is skipped here.
262  if( ass_pos != std::string::npos ) {
263  if( trim_ ) {
264  result.emplace_back(
265  utils::trim( entry.substr( 0, ass_pos )),
266  utils::trim( entry.substr( ass_pos + 1 ))
267  );
268  } else {
269  result.emplace_back(
270  entry.substr( 0, ass_pos ),
271  entry.substr( ass_pos + 1 )
272  );
273  }
274  }
275  }
276 
277  last_pos = pos + 1;
278  }
279  }
280 
281  return result;
282 }
283 
284 void KeyedAttributeTreeNewickReaderPlugin::process_keyed_attributes_(
285  PairList const& data,
286  AttributeTreeMap& attributes,
287  Target target
288 ) const {
289  // Process indexed attributes...
290  for( auto const& attrs : keyed_attributes_ ) {
291 
292  // ... but only if they are for our target.
293  if( attrs.target != target ) {
294  continue;
295  }
296 
297  // Process all data, and add it if it fits the key.
298  bool found_key = false;
299  for( auto const& datum : data ) {
300  if( datum.first == attrs.source_key ) {
301  attributes[ attrs.target_key ] = datum.second;
302  found_key = true;
303  }
304  }
305 
306  // If we did not find the key in the data, but want to use default, set it.
307  if( ! found_key && attrs.use_default ) {
308  attributes[ attrs.target_key ] = attrs.default_value;
309  }
310  }
311 }
312 
313 void KeyedAttributeTreeNewickReaderPlugin::process_catch_all_attributes_(
314  PairList const& data,
315  AttributeTreeMap& attributes,
316  Target target
317 ) const {
318  // Process catch all attributes...
319  for( auto const& attrs : catch_all_attributes_ ) {
320 
321  // ... but only if they are for our target.
322  if( attrs.target != target ) {
323  continue;
324  }
325 
326  // Process all data and add the key value pairs to the target.
327  for( auto const& datum : data ) {
328  attributes[ datum.first ] = datum.second;
329  }
330  }
331 }
332 
333 } // namespace tree
334 } // namespace genesis
void element_to_edge(NewickBrokerElement const &element, TreeEdge &edge) const
bool starts_with(std::string const &text, std::string const &start)
Return whether a string starts with another string.
Definition: string.cpp:61
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:238
Data class for AttributeTreeNodes.
Data class for AttributeTreeEdges.
Target
Select where to store the data, i.e., at Nodes or Edges of the Tree.
static std::unique_ptr< AttributeTreeEdgeData > create()
self_type & add_catch_all(Target target=Target::kNode)
Store all key-value-pairs of the Newick data in an AttributeTree.
std::string separator() const
Get the currently set separator between key-value-pairs.
static std::unique_ptr< AttributeTreeNodeData > create()
Provide a set of plugin functions for NewickReader to read key-value-pair data attributes into an Att...
self_type & set_nhx_delimiters()
Set the delimiters to the format used by NHX.
std::string assigner() const
Get the currently set assign symbol between a key and its value.
Store data at the attributes map of an AttributeTreeNode.
self_type & add_nhx_attributes()
Add typical attributes of the NHX format, and set the appropriate delimiters.
create_edge_data_function create_edge_data_plugin
Provides some commonly used string utility functions.
Store data at the attributes map of an AttributeTreeEdge.
std::vector< element_to_edge_function > element_to_edge_plugins
void clear()
Reset all settings to the default and delete all attribute settings.
self_type & add_attribute(std::string const &key, Target target)
Store values of a key at a target (i.e., Node or Edge).
create_node_data_function create_node_data_plugin
NodeDataType & data()
Definition: node.hpp:108
std::map< std::string, std::string > AttributeTreeMap
Alias for the map type used by an AttributeTree.
void element_to_node(NewickBrokerElement const &element, TreeNode &node) const
EdgeDataType & data()
Definition: edge.hpp:118
std::string prefix() const
Get the currently set prefix to look for in Newick comments.
Store the information for one element of a Newick tree.
Definition: element.hpp:60
std::vector< element_to_node_function > element_to_node_plugins