A toolkit for working with phylogenetic data.
v0.24.0
tree/formats/newick/writer.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 #include "genesis/tree/tree.hpp"
38 
43 
44 #include <cassert>
45 #include <deque>
46 #include <memory>
47 #include <ostream>
48 #include <sstream>
49 #include <stdexcept>
50 #include <vector>
51 
52 namespace genesis {
53 namespace tree {
54 
55 // =================================================================================================
56 // Writing
57 // =================================================================================================
58 
59 void NewickWriter::write( Tree const& tree, std::shared_ptr<utils::BaseOutputTarget> target ) const
60 {
61  write( tree_to_broker( tree ), target );
62 }
63 
64 void NewickWriter::write( TreeSet const& tree_set, std::shared_ptr<utils::BaseOutputTarget> target, bool with_names ) const
65 {
66  auto& os = target->ostream();
67  for( size_t i = 0; i < tree_set.size(); ++i ) {
68 
69  // Write the name if wanted. We here ignore that this makes the line longer,
70  // and so the first line might exceed line_length a bit more. This is just a convenience
71  // anyway.
72  if( with_names ) {
73  auto const& name = tree_set.name_at(i);
74  bool need_qmarks = force_quot_marks_;
75  need_qmarks |= ( std::string::npos != name.find_first_of( " :;()[],=" ));
76  need_qmarks |= ( write_tags_ && std::string::npos != name.find_first_of( "{}" ));
77 
78  if( need_qmarks ) {
79  os << quotation_mark_ << name << quotation_mark_;
80  } else {
81  os << name;
82  }
83  os << " = ";
84  }
85 
86  write( tree_to_broker( tree_set.at(i) ), target );
87  }
88 }
89 
90 std::string NewickWriter::to_string( Tree const& tree ) const
91 {
92  std::ostringstream oss;
93  write( tree, utils::to_stream( oss ));
94  return oss.str();
95 }
96 
97 // =================================================================================================
98 // Intermediate Functions
99 // =================================================================================================
100 
102 {
103  NewickBroker broker;
104  for( auto const& prepare_plugin : prepare_writing_plugins ) {
105  prepare_plugin( tree, broker );
106  }
107 
108  // store the depth from each node to the root. this is needed to assign levels of depth
109  // to the nodes for the broker.
110  auto depth = node_path_length_vector(tree);
111 
112  // now fill the broker with nodes via postorder traversal, so that the root is put on top last.
113  broker.clear();
114  for( auto it : postorder(tree) ) {
116  bn.depth = depth[ it.node().index() ];
117 
118  for( auto const& node_plugin : node_to_element_plugins ) {
119  node_plugin( it.node(), bn );
120  }
121  // only write edge data to the broker element if it is not the last iteration.
122  // the last iteration is the root, which usually does not have edge information in newick.
123  // caveat: for the root node, the edge will point to an arbitrary edge away from the root.
124  if( !it.is_last_iteration() ) {
125  for( auto const& edge_plugin : edge_to_element_plugins ) {
126  edge_plugin( it.edge(), bn );
127  }
128  }
129 
130  broker.push_top(bn);
131  }
132 
133  broker.assign_ranks();
134  for( auto const& finish_plugin : finish_writing_plugins ) {
135  finish_plugin( tree, broker );
136  }
137  return broker;
138 }
139 
140 void NewickWriter::write( NewickBroker const& broker, std::shared_ptr<utils::BaseOutputTarget> target ) const
141 {
142 
143  auto& os = target->ostream();
144 
145  // Assertion helpers: how many parenthesis were written?
146  size_t op = 0;
147  size_t cp = 0;
148 
149  // Iterate broker in reverse order, because Newick...
150  size_t prev_depth = 0;
151  size_t cur_length = 0;
152  for( long pos = broker.size() - 1; pos >= 0; --pos ) {
153  auto const& elem = broker[pos];
154  if( elem.depth < 0 ) {
155  throw std::runtime_error( "Invalid NewickBroker: Depth < 0." );
156  }
157 
158  // Opening parenthesis.
159  // We open as many as needed to get to the depth of the current element.
160  // They will all be closed when processing the respective parent elements.
161  assert( elem.depth >= 0 );
162  for( size_t i = prev_depth; i < static_cast<size_t>( elem.depth ); ++i ) {
163  os << "(";
164  cur_length += 1;
165  ++op;
166  }
167 
168  // Write the NewickBrokerElement to the stream.
169  cur_length += write_( broker[pos], os );
170 
171  // Stop if it is the root. Don't have to write parenthesis or commas after the root element.
172  if( pos == 0 ) {
173  continue;
174  }
175 
176  // Closing parenthesis or comma for next element.
177  // Even for "empty" elements (e.g., inner nodes with no names), this is called,
178  // which ensures correct nesting.
179  if( broker[ pos - 1 ].depth == elem.depth - 1 ) {
180  os << ")";
181  cur_length += 1;
182  ++cp;
183  } else {
184  os << ",";
185  cur_length += 1;
186  }
187  prev_depth = elem.depth;
188 
189  // Line length check.
190  if( line_length_ > 0 && cur_length >= line_length_ ) {
191  os << "\n";
192  cur_length = 0;
193  }
194  }
195 
196  // Have to have written as many opening as closing parenthesis.
197  // Use void casts to avoid compiler warnings in release mode.
198  assert( op == cp );
199  (void) op;
200  (void) cp;
201 
202  os << ";";
203  if( trailing_new_line_ ) {
204  os << "\n";
205  }
206 }
207 
208 // =================================================================================================
209 // Internal Functions
210 // =================================================================================================
211 
212 size_t NewickWriter::write_( NewickBrokerElement const& bn, std::ostream& os ) const
213 {
214  size_t length = 0;
215 
216  // Write name.
217  if( write_names_ ) {
218  // Find out whether we need to put this name in quotation marks.
219  // According to http://evolution.genetics.washington.edu/phylip/newicktree.html :
220  // "A name can be any string of printable characters except blanks, colons, semicolons,
221  // parentheses, and square brackets." Well, they forgot to mention commas here.
222  // But we knew before that Newick is not a good format anyway...
223  // Also, if write_tags_ is true, we also quote {}, as those are used for tags.
224  bool need_qmarks = force_quot_marks_;
225  need_qmarks |= ( std::string::npos != bn.name.find_first_of( " :;()[]," ));
226  need_qmarks |= ( write_tags_ && std::string::npos != bn.name.find_first_of( "{}" ));
227 
228  if( need_qmarks ) {
229  os << quotation_mark_ << bn.name << quotation_mark_;
230  length += 2 + bn.name.size();
231  } else {
232  os << bn.name;
233  length += bn.name.size();
234  }
235  }
236 
237  // Write values (":...")
238  if( write_values_ ) {
239  for( std::string const& v : bn.values ) {
240  os << ":" << v;
241  length += 1 + v.size();
242  }
243  }
244 
245  // Write comments ("[...]")
246  if( write_comments_ ) {
247  for( std::string const& c : bn.comments ) {
248  os << "[" << c << "]";
249  length += 1 + c.size();
250  }
251  }
252 
253  // Write tags ("{...}")
254  if( write_tags_ ) {
255  for( std::string const& t : bn.tags ) {
256  os << "{" << t << "}";
257  length += 1 + t.size();
258  }
259  }
260 
261  return length;
262 }
263 
264 // std::string NewickWriter::to_string_rec_( NewickBroker const& broker, size_t pos ) const
265 // {
266 // // Old, recursive, slow version. Not used any more.
267 //
268 // // check if it is a leaf, stop recursion if so.
269 // if (broker[pos].rank() == 0) {
270 // return element_to_string_(broker[pos]);
271 // }
272 //
273 // // recurse over all children of the current node. while doing so, build a stack of the resulting
274 // // substrings in reverse order. this is because newick stores the nodes kind of "backwards",
275 // // by starting at a leaf node instead of the root.
276 // std::deque<std::string> children;
277 // for (size_t i = pos + 1; i < broker.size() && broker[i].depth > broker[pos].depth; ++i) {
278 // // skip if not immediate children (those will be called in later recursion steps)
279 // if (broker[i].depth > broker[pos].depth + 1) {
280 // continue;
281 // }
282 //
283 // // do the recursion step for this child, add the result to a stack
284 // children.push_front(to_string_rec_(broker, i));
285 // }
286 //
287 // // build the string by iterating the stack
288 // std::ostringstream out;
289 // out << "(";
290 // for( size_t i = 0; i < children.size(); ++i ) {
291 // if (i>0) {
292 // out << ",";
293 // }
294 // out << children[i];
295 // }
296 // out << ")" << element_to_string_(broker[pos]);
297 // return out.str();
298 // }
299 
300 } // namespace tree
301 } // namespace genesis
std::vector< finish_writing_function > finish_writing_plugins
Collect all functions to be called after finishing the actual tree writing.
std::vector< std::string > values
Numerical values associated with the node, i.e. branch lengths.
Definition: element.hpp:122
Tree & at(size_t index)
Definition: tree_set.hpp:178
std::string const & name_at(size_t index) const
Definition: tree_set.hpp:138
std::shared_ptr< BaseOutputTarget > to_stream(std::ostream &target_stream, GzipCompressionLevel compression_level=GzipCompressionLevel::kNoCompression)
Obtain an output target for writing to a stream.
std::vector< size_t > node_path_length_vector(Tree const &tree, TreeNode const &node)
Return a vector containing the depth of all nodes with respect to the given start node...
size_t size() const
Return the size of the TreeSet, i.e., the number of stored Trees.
Definition: tree_set.hpp:228
std::vector< node_to_element_function > node_to_element_plugins
Collect all functions to be called for each TreeNode in order to translate it to a Newick representat...
void push_top(NewickBrokerElement const &node)
Definition: broker.cpp:51
size_t size() const
Returns the size of the stack, i.e. the number of nodes stored in the broker.
Definition: broker.cpp:178
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string to_string(Tree const &tree) const
Shorthand to write a Tree to Newick format and return it is a string.
Provides some valuable additions to STD.
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
tree::TreeSet tree_set(SampleSet const &sample_set)
Return a TreeSet containing all the trees of the SampleSet.
Definition: sample_set.cpp:156
Stores a Newick tree in an intermediate format that can be further processed into a Tree...
Definition: broker.hpp:106
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
void write(Tree const &tree, std::shared_ptr< utils::BaseOutputTarget > target) const
Write a Tree to an output target, using the Newick format.
void assign_ranks() const
Iterate over the tree and assign ranks (= number of immediate children) to all nodes.
Definition: broker.cpp:85
std::vector< std::string > comments
Arbitrary strings that can be attached to a node, e.g. in Newick format via "[]". ...
Definition: element.hpp:132
std::vector< prepare_writing_function > prepare_writing_plugins
Collect all functions to be called before starting the actual tree writing.
std::vector< std::string > tags
Arbitrary strings that can be attached to a node, e.g. in Newick format via "{}". ...
Definition: element.hpp:127
NewickBroker tree_to_broker(Tree const &tree) const
Transform the information of the tree into a NewickBroker object.
void clear()
Deletes all nodes from the broker.
Definition: broker.cpp:46
Provides functions for accessing the file system.
Provides easy and fast logging functionality.
std::vector< edge_to_element_function > edge_to_element_plugins
Collect all functions to be called for each TreeEdge in order to translate it to a Newick representat...
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Header of Tree class.
Header of Tree distance methods.
std::string name
Name of the node.
Definition: element.hpp:114
Store the information for one element of a Newick tree.
Definition: element.hpp:60
long depth
Depth of the node in the tree, i.e. its distance from the root.
Definition: element.hpp:137