A toolkit for working with phylogenetic data.
v0.24.0
keyed_newick_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 
38 namespace genesis {
39 namespace tree {
40 
41 // =================================================================================================
42 // Static Member Initialization
43 // =================================================================================================
44 
45 std::vector<KeyedAttributeTreeNewickReaderPlugin::AttributeDescriptor>
46 KeyedAttributeTreeNewickReaderPlugin::nhx_attributes_ ={
47  { "AC", Target::kNode, "AC", "", false }, // sequence accession
48  { "Co", Target::kNode, "Co", "", false }, // collapse this node when drawing the tree (default is not to collapse)
49  { "D", Target::kNode, "D", "", false }, // duplication event
50  { "E", Target::kNode, "E", "", false }, // EC number
51  { "GN", Target::kNode, "GN", "", false }, // gene name
52  { "O", Target::kNode, "O", "", false }, // orthologous to this external node
53  { "S", Target::kNode, "S", "", false }, // species name
54  { "SO", Target::kNode, "SO", "", false }, // "super orthologous" (no duplications on paths)
55  { "T", Target::kNode, "T", "", false }, // taxonomy ID
56 
57  { "B", Target::kEdge, "B", "", false }, // confidence value for parent branch
58  { "L", Target::kEdge, "L", "", false }, // log likelihood value on parent branch
59  { "Sw", Target::kEdge, "Sw", "", false }, // placing a subtree on the parent branch of this node
60  // makes the tree significantly worse according to
61  // Kishino/Hasegawa test (or similar)
62 };
63 
64 // =================================================================================================
65 // Settings
66 // =================================================================================================
67 
69  std::string const& key,
70  Target target
71 ) {
72  keyed_attributes_.push_back({ key, target, key, "", false });
73  return *this;
74 }
75 
77  std::string const& source_key,
78  Target target,
79  std::string const& target_key
80 ) {
81  keyed_attributes_.push_back({ source_key, target, target_key, "", false });
82  return *this;
83 }
84 
86  std::string const& source_key,
87  Target target,
88  std::string const& target_key,
89  std::string const& default_value
90 ) {
91  keyed_attributes_.push_back({ source_key, target, target_key, default_value, true });
92  return *this;
93 }
94 
96  Target target
97 ) {
98  catch_all_attributes_.push_back({ "", target, "", "", true });
99  return *this;
100 }
101 
103 {
104  prefix( "&&NHX" );
105  separator( ":" );
106  assigner( "=" );
107  return *this;
108 }
109 
111 {
113  for( auto const& attrib : nhx_attributes_ ) {
114  keyed_attributes_.push_back( attrib );
115  }
116  return *this;
117 }
118 
120 {
121  prefix_ = "&";
122  separator_ = ",";
123  assigner_ = "=";
124  trim_ = true;
125 
126  keyed_attributes_.clear();
127  catch_all_attributes_.clear();
128 }
129 
130 // =================================================================================================
131 // Plugin Functions
132 // =================================================================================================
133 
135 {
136  // Speedup.
137  if( ! has_attributes_for_target_( Target::kNode ) ) {
138  return;
139  }
140 
141  // Prepare data.
142  auto data = get_data_( element );
143  auto& attributes = node.data<AttributeTreeNodeData>().attributes;
144 
145  // Process all attributes.
146  process_keyed_attributes_( data, attributes, Target::kNode );
147  process_catch_all_attributes_( data, attributes, Target::kNode );
148 }
149 
151 {
152  // Speedup.
153  if( ! has_attributes_for_target_( Target::kEdge ) ) {
154  return;
155  }
156 
157  // Prepare data.
158  auto data = get_data_( element );
159  auto& attributes = edge.data<AttributeTreeEdgeData>().attributes;
160 
161  // Process all attributes.
162  process_keyed_attributes_( data, attributes, Target::kEdge );
163  process_catch_all_attributes_( data, attributes, Target::kEdge );
164 }
165 
167 {
168  // Set node data creation function.
169  reader.create_node_data_plugin = []( TreeNode& node ){
170  node.reset_data( AttributeTreeNodeData::create() );
171  };
172 
173  // Set edge data creation function.
174  reader.create_edge_data_plugin = []( TreeEdge& edge ){
175  edge.reset_data( AttributeTreeEdgeData::create() );
176  };
177 
178  // Add node manipulation functions.
179  reader.element_to_node_plugins.push_back(
180  [&]( NewickBrokerElement const& element, TreeNode& node ) {
181  element_to_node( element, node );
182  }
183  );
184 
185  // Add edge manipulation functions.
186  reader.element_to_edge_plugins.push_back(
187  [&]( NewickBrokerElement const& element, TreeEdge& edge ) {
188  element_to_edge( element, edge );
189  }
190  );
191 }
192 
193 // =================================================================================================
194 // Internal Functions
195 // =================================================================================================
196 
197 bool KeyedAttributeTreeNewickReaderPlugin::has_attributes_for_target_( Target target ) const
198 {
199  // This function is used by the element processing functions in order to check whether they
200  // have any work to do. If there is no attribute that targets the Nodes or Edges, then
201  // we do not need to process the data at all for that target.
202  // Another way, that would give even more speedup, would be to process the data only once
203  // (call get_data_ once) instead of twice (for nodes and for edges separately), but this would
204  // meant that we need to introduce more involved plugin functions to the newick reader.
205  // Either we'd need an element_to_node_and_edge function, or some for of element preprocessing,
206  // that would store some state (the data in our case) while it is processed, and that can then
207  // be used by the element_to_... functions, instead of processing the data twice. This is
208  // however more complex, and involves a fragile state that is only valid during the processing
209  // of one element. So, for now, we live with the slight performance issue that we sometimes
210  // need to split the comment data twice...
211 
212  for( auto const& attrs : keyed_attributes_ ) {
213  if( attrs.target == target ) {
214  return true;
215  }
216  }
217  for( auto const& attrs : catch_all_attributes_ ) {
218  if( attrs.target == target ) {
219  return true;
220  }
221  }
222  return false;
223 }
224 
225 KeyedAttributeTreeNewickReaderPlugin::PairList KeyedAttributeTreeNewickReaderPlugin::get_data_(
226  NewickBrokerElement const& element
227 ) const {
228  auto result = PairList();
229 
230  // Process all comments and see whether they have the desireed prefix.
231  for( auto const& comment : element.comments ) {
232  if( ! utils::starts_with( comment, prefix_ ) ) {
233  continue;
234  }
235 
236  // Skip the prefix...
237  size_t pos = prefix_.size();
238  size_t last_pos = prefix_.size();
239 
240  // ... then split the rest of the comment into key value pairs.
241  while( pos < comment.length() ) {
242 
243  // Find the next occurence of the separator, limited to the end of the string.
244  pos = comment.find( separator_, last_pos );
245  if( pos == std::string::npos ) {
246  pos = comment.length();
247  }
248 
249  // We found a key value pair.
250  if( pos != last_pos ) {
251  // Get the pair. This is a copy. Could be avoided, but I am lazy today.
252  assert( pos > last_pos );
253  auto entry = std::string( comment.data() + last_pos, pos - last_pos );
254 
255  // Split it according to the assign string.
256  auto ass_pos = entry.find( assigner_ );
257 
258  // If there is an assign sign, use it to split.
259  // This also avoids to add empty data. For example, NHX starts with a separator
260  // after the prefix, so there is an empty field in the beginning. As this does not
261  // contain the assigner, it is skipped here.
262  if( ass_pos != std::string::npos ) {
263  if( trim_ ) {
264  result.emplace_back(
265  utils::trim( entry.substr( 0, ass_pos )),
266  utils::trim( entry.substr( ass_pos + 1 ))
267  );
268  } else {
269  result.emplace_back(
270  entry.substr( 0, ass_pos ),
271  entry.substr( ass_pos + 1 )
272  );
273  }
274  }
275  }
276 
277  last_pos = pos + 1;
278  }
279  }
280 
281  return result;
282 }
283 
284 void KeyedAttributeTreeNewickReaderPlugin::process_keyed_attributes_(
285  PairList const& data,
286  AttributeTreeMap& attributes,
287  Target target
288 ) const {
289  // Process indexed attributes...
290  for( auto const& attrs : keyed_attributes_ ) {
291 
292  // ... but only if they are for our target.
293  if( attrs.target != target ) {
294  continue;
295  }
296 
297  // Process all data, and add it if it fits the key.
298  bool found_key = false;
299  for( auto const& datum : data ) {
300  if( datum.first == attrs.source_key ) {
301  attributes[ attrs.target_key ] = datum.second;
302  found_key = true;
303  }
304  }
305 
306  // If we did not find the key in the data, but want to use default, set it.
307  if( ! found_key && attrs.use_default ) {
308  attributes[ attrs.target_key ] = attrs.default_value;
309  }
310  }
311 }
312 
313 void KeyedAttributeTreeNewickReaderPlugin::process_catch_all_attributes_(
314  PairList const& data,
315  AttributeTreeMap& attributes,
316  Target target
317 ) const {
318  // Process catch all attributes...
319  for( auto const& attrs : catch_all_attributes_ ) {
320 
321  // ... but only if they are for our target.
322  if( attrs.target != target ) {
323  continue;
324  }
325 
326  // Process all data and add the key value pairs to the target.
327  for( auto const& datum : data ) {
328  attributes[ datum.first ] = datum.second;
329  }
330  }
331 }
332 
333 } // namespace tree
334 } // namespace genesis
bool starts_with(std::string const &text, std::string const &start)
Return whether a string starts with another string.
Definition: string.cpp:79
Data class for AttributeTreeNodes.
Data class for AttributeTreeEdges.
Target
Select where to store the data, i.e., at Nodes or Edges of the Tree.
static std::unique_ptr< AttributeTreeEdgeData > create()
void element_to_edge(NewickBrokerElement const &element, TreeEdge &edge) const
self_type & add_catch_all(Target target=Target::kNode)
Store all key-value-pairs of the Newick data in an AttributeTree.
std::string assigner() const
Get the currently set assign symbol between a key and its value.
static std::unique_ptr< AttributeTreeNodeData > create()
Provide a set of plugin functions for NewickReader to read key-value-pair data attributes into an Att...
self_type & set_nhx_delimiters()
Set the delimiters to the format used by NHX.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string separator() const
Get the currently set separator between key-value-pairs.
Store data at the attributes map of an AttributeTreeNode.
self_type & add_nhx_attributes()
Add typical attributes of the NHX format, and set the appropriate delimiters.
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:394
std::vector< std::string > comments
Arbitrary strings that can be attached to a node, e.g. in Newick format via "[]". ...
Definition: element.hpp:132
create_edge_data_function create_edge_data_plugin
std::string prefix() const
Get the currently set prefix to look for in Newick comments.
Provides some commonly used string utility functions.
Store data at the attributes map of an AttributeTreeEdge.
void element_to_node(NewickBrokerElement const &element, TreeNode &node) const
std::vector< element_to_edge_function > element_to_edge_plugins
void clear()
Reset all settings to the default and delete all attribute settings.
self_type & add_attribute(std::string const &key, Target target)
Store values of a key at a target (i.e., Node or Edge).
create_node_data_function create_node_data_plugin
NodeDataType & data()
Definition: node.hpp:203
std::map< std::string, std::string > AttributeTreeMap
Alias for the map type used by an AttributeTree.
EdgeDataType & data()
Definition: edge.hpp:217
Store the information for one element of a Newick tree.
Definition: element.hpp:60
std::vector< element_to_node_function > element_to_node_plugins