A library for working with phylogenetic and population genetic data.
v0.32.0
tree/formats/newick/writer.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
37 #include "genesis/tree/tree.hpp"
38 
44 
45 #include <cassert>
46 #include <deque>
47 #include <memory>
48 #include <ostream>
49 #include <sstream>
50 #include <stdexcept>
51 #include <vector>
52 
53 namespace genesis {
54 namespace tree {
55 
56 // =================================================================================================
57 // Writing
58 // =================================================================================================
59 
60 void NewickWriter::write( Tree const& tree, std::shared_ptr<utils::BaseOutputTarget> target ) const
61 {
62  write( tree_to_broker( tree ), target );
63 }
64 
66  TreeSet const& tree_set,
67  std::shared_ptr<utils::BaseOutputTarget> target,
68  bool with_names
69 ) const {
70  auto& os = target->ostream();
71  for( size_t i = 0; i < tree_set.size(); ++i ) {
72 
73  // Write the name if wanted. We here ignore that this makes the line longer,
74  // and so the first line might exceed line_length a bit more. This is just a convenience
75  // anyway.
76  if( with_names ) {
77  auto const& name = tree_set.name_at(i);
78  bool need_qmarks = force_quot_marks_;
79  need_qmarks |= ( std::string::npos != name.find_first_of( " :;()[],=" ));
80  need_qmarks |= ( write_tags_ && std::string::npos != name.find_first_of( "{}" ));
81 
82  if( need_qmarks ) {
83  os << quotation_mark_ << name << quotation_mark_;
84  } else {
85  os << name;
86  }
87  os << " = ";
88  }
89 
90  write( tree_to_broker( tree_set.at(i) ), target );
91  }
92 }
93 
94 std::string NewickWriter::to_string( Tree const& tree ) const
95 {
96  std::ostringstream oss;
97  write( tree, utils::to_stream( oss ));
98  return oss.str();
99 }
100 
101 // =================================================================================================
102 // Intermediate Functions
103 // =================================================================================================
104 
106 {
107  NewickBroker broker;
108  for( auto const& prepare_plugin : prepare_writing_plugins ) {
109  prepare_plugin( tree, broker );
110  }
111 
112  // store the depth from each node to the root. this is needed to assign levels of depth
113  // to the nodes for the broker.
114  auto depth = node_path_length_vector(tree);
115 
116  // now fill the broker with nodes via postorder traversal, so that the root is put on top last.
117  broker.clear();
118  for( auto it : postorder(tree) ) {
120  bn.depth = depth[ it.node().index() ];
121 
122  for( auto const& node_plugin : node_to_element_plugins ) {
123  node_plugin( it.node(), bn );
124  }
125  // only write edge data to the broker element if it is not the last iteration.
126  // the last iteration is the root, which usually does not have edge information in newick.
127  // caveat: for the root node, the edge will point to an arbitrary edge away from the root.
128  if( !it.is_last_iteration() ) {
129  for( auto const& edge_plugin : edge_to_element_plugins ) {
130  edge_plugin( it.edge(), bn );
131  }
132  }
133 
134  broker.push_top(bn);
135  }
136 
137  broker.assign_ranks();
138  for( auto const& finish_plugin : finish_writing_plugins ) {
139  finish_plugin( tree, broker );
140  }
141  return broker;
142 }
143 
144 void NewickWriter::write( NewickBroker const& broker, std::shared_ptr<utils::BaseOutputTarget> target ) const
145 {
146  auto& os = target->ostream();
147 
148  // Assertion helpers: how many parenthesis were written?
149  size_t op = 0;
150  size_t cp = 0;
151 
152  // Iterate broker in reverse order, because Newick...
153  size_t prev_depth = 0;
154  size_t cur_length = 0;
155  for( size_t pos_rev = 0; pos_rev < broker.size(); ++pos_rev ) {
156  auto const pos = (broker.size() - 1) - pos_rev;
157  auto const& elem = broker[pos];
158  if( elem.depth < 0 ) {
159  throw std::runtime_error( "Invalid NewickBroker: Depth < 0." );
160  }
161 
162  // Opening parenthesis.
163  // We open as many as needed to get to the depth of the current element.
164  // They will all be closed when processing the respective parent elements.
165  assert( elem.depth >= 0 );
166  for( size_t i = prev_depth; i < static_cast<size_t>( elem.depth ); ++i ) {
167  os << "(";
168  cur_length += 1;
169  ++op;
170  }
171 
172  // Write the NewickBrokerElement to the stream.
173  cur_length += write_( broker[pos], os );
174 
175  // Stop if it is the root. Don't have to write parenthesis or commas after the root element.
176  if( pos == 0 ) {
177  continue;
178  }
179 
180  // Closing parenthesis or comma for next element.
181  // Even for "empty" elements (e.g., inner nodes with no names), this is called,
182  // which ensures correct nesting.
183  if( broker[ pos - 1 ].depth == elem.depth - 1 ) {
184  os << ")";
185  cur_length += 1;
186  ++cp;
187  } else {
188  os << ",";
189  cur_length += 1;
190  }
191  prev_depth = elem.depth;
192 
193  // Line length check.
194  if( line_length_ > 0 && cur_length >= line_length_ ) {
195  os << "\n";
196  cur_length = 0;
197  }
198  }
199 
200  // Have to have written as many opening as closing parenthesis.
201  // Use void casts to avoid compiler warnings in release mode.
202  assert( op == cp );
203  (void) op;
204  (void) cp;
205 
206  os << ";";
207  if( trailing_new_line_ ) {
208  os << "\n";
209  }
210 }
211 
212 // =================================================================================================
213 // Internal Functions
214 // =================================================================================================
215 
216 size_t NewickWriter::write_( NewickBrokerElement const& bn, std::ostream& os ) const
217 {
218  size_t length = 0;
219 
220  // Write name.
221  if( write_names_ ) {
222  // Find out whether we need to put this name in quotation marks.
223  // According to http://evolution.genetics.washington.edu/phylip/newicktree.html :
224  // "A name can be any string of printable characters except blanks, colons, semicolons,
225  // parentheses, and square brackets." Well, they forgot to mention commas here.
226  // But we knew before that Newick is not a good format anyway...
227  // Also, if write_tags_ is true, we also quote {}, as those are used for tags.
228  bool need_qmarks = force_quot_marks_;
229  need_qmarks |= ( std::string::npos != bn.name.find_first_of( " :;()[]," ));
230  need_qmarks |= ( write_tags_ && std::string::npos != bn.name.find_first_of( "{}" ));
231 
232  // Check for nonprintables.
233  for( auto const& c : bn.name ) {
234  if( ! utils::is_print(c) ) {
235  throw std::runtime_error(
236  "Error writing Newick file: non-printable character " + utils::char_to_hex( c ) +
237  "found in Newick node label."
238  );
239  }
240  }
241 
242  if( need_qmarks ) {
243  os << quotation_mark_ << bn.name << quotation_mark_;
244  length += 2 + bn.name.size();
245  } else {
246  os << bn.name;
247  length += bn.name.size();
248  }
249  }
250 
251  // Write values (":...")
252  if( write_values_ ) {
253  for( std::string const& v : bn.values ) {
254  os << ":" << v;
255  length += 1 + v.size();
256  }
257  }
258 
259  // Write comments ("[...]")
260  if( write_comments_ ) {
261  for( std::string const& c : bn.comments ) {
262  os << "[" << c << "]";
263  length += 1 + c.size();
264  }
265  }
266 
267  // Write tags ("{...}")
268  if( write_tags_ ) {
269  for( std::string const& t : bn.tags ) {
270  os << "{" << t << "}";
271  length += 1 + t.size();
272  }
273  }
274 
275  return length;
276 }
277 
278 // std::string NewickWriter::to_string_rec_( NewickBroker const& broker, size_t pos ) const
279 // {
280 // // Old, recursive, slow version. Not used any more.
281 //
282 // // check if it is a leaf, stop recursion if so.
283 // if (broker[pos].rank() == 0) {
284 // return element_to_string_(broker[pos]);
285 // }
286 //
287 // // recurse over all children of the current node. while doing so, build a stack of the resulting
288 // // substrings in reverse order. this is because newick stores the nodes kind of "backwards",
289 // // by starting at a leaf node instead of the root.
290 // std::deque<std::string> children;
291 // for (size_t i = pos + 1; i < broker.size() && broker[i].depth > broker[pos].depth; ++i) {
292 // // skip if not immediate children (those will be called in later recursion steps)
293 // if (broker[i].depth > broker[pos].depth + 1) {
294 // continue;
295 // }
296 //
297 // // do the recursion step for this child, add the result to a stack
298 // children.push_front(to_string_rec_(broker, i));
299 // }
300 //
301 // // build the string by iterating the stack
302 // std::ostringstream out;
303 // out << "(";
304 // for( size_t i = 0; i < children.size(); ++i ) {
305 // if (i>0) {
306 // out << ",";
307 // }
308 // out << children[i];
309 // }
310 // out << ")" << element_to_string_(broker[pos]);
311 // return out.str();
312 // }
313 
314 } // namespace tree
315 } // namespace genesis
broker.hpp
genesis::tree::NewickWriter::tree_to_broker
NewickBroker tree_to_broker(Tree const &tree) const
Transform the information of the tree into a NewickBroker object.
Definition: tree/formats/newick/writer.cpp:105
genesis::tree::NewickBrokerElement::name
std::string name
Name of the node.
Definition: element.hpp:114
fs.hpp
Provides functions for accessing the file system.
genesis::tree::NewickBrokerElement::comments
std::vector< std::string > comments
Arbitrary strings that can be attached to a node, e.g. in Newick format via "[]".
Definition: element.hpp:132
genesis::tree::NewickWriter::finish_writing_plugins
std::vector< finish_writing_function > finish_writing_plugins
Collect all functions to be called after finishing the actual tree writing.
Definition: tree/formats/newick/writer.hpp:210
genesis::tree::TreeSet::size
size_t size() const
Return the size of the TreeSet, i.e., the number of stored Trees.
Definition: tree_set.hpp:228
genesis::tree::NewickBroker::clear
void clear()
Deletes all nodes from the broker.
Definition: broker.cpp:46
output_stream.hpp
genesis::tree::NewickWriter::prepare_writing_plugins
std::vector< prepare_writing_function > prepare_writing_plugins
Collect all functions to be called before starting the actual tree writing.
Definition: tree/formats/newick/writer.hpp:205
genesis::placement::tree_set
tree::TreeSet tree_set(SampleSet const &sample_set)
Return a TreeSet containing all the trees of the SampleSet.
Definition: sample_set.cpp:156
tree.hpp
Header of Tree class.
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
genesis::tree::NewickWriter::write
void write(Tree const &tree, std::shared_ptr< utils::BaseOutputTarget > target) const
Write a Tree to an output target, using the Newick format.
Definition: tree/formats/newick/writer.cpp:60
genesis::tree::TreeSet::name_at
std::string const & name_at(size_t index) const
Definition: tree_set.hpp:138
tree_set.hpp
genesis::tree::TreeSet::at
Tree & at(size_t index)
Definition: tree_set.hpp:178
std.hpp
Provides some valuable additions to STD.
genesis::tree::NewickBroker::push_top
void push_top(NewickBrokerElement const &node)
Definition: broker.cpp:51
genesis::tree::NewickBrokerElement::values
std::vector< std::string > values
Numerical values associated with the node, i.e. branch lengths.
Definition: element.hpp:122
genesis::tree::TreeSet
Definition: tree_set.hpp:48
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
logging.hpp
Provides easy and fast logging functionality.
genesis::utils::is_print
constexpr bool is_print(char c) noexcept
Return whether a char is a printable character, according to isprint of the cctype header,...
Definition: char.hpp:209
genesis::tree::NewickWriter::edge_to_element_plugins
std::vector< edge_to_element_function > edge_to_element_plugins
Collect all functions to be called for each TreeEdge in order to translate it to a Newick representat...
Definition: tree/formats/newick/writer.hpp:222
genesis::tree::NewickBrokerElement::tags
std::vector< std::string > tags
Arbitrary strings that can be attached to a node, e.g. in Newick format via "{}".
Definition: element.hpp:127
genesis::tree::NewickWriter::to_string
std::string to_string(Tree const &tree) const
Shorthand to write a Tree to Newick format and return it is a string.
Definition: tree/formats/newick/writer.cpp:94
genesis::utils::to_stream
std::shared_ptr< BaseOutputTarget > to_stream(std::ostream &target_stream, GzipCompressionLevel compression_level=GzipCompressionLevel::kNoCompression)
Obtain an output target for writing to a stream.
Definition: output_target.hpp:203
writer.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::NewickBroker
Stores a Newick tree in an intermediate format that can be further processed into a Tree.
Definition: broker.hpp:106
char.hpp
genesis::tree::NewickBroker::assign_ranks
void assign_ranks() const
Iterate over the tree and assign ranks (= number of immediate children) to all nodes.
Definition: broker.cpp:85
genesis::tree::NewickBrokerElement::depth
long depth
Depth of the node in the tree, i.e. its distance from the root.
Definition: element.hpp:137
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
postorder.hpp
genesis::tree::NewickBrokerElement
Store the information for one element of a Newick tree.
Definition: element.hpp:60
distances.hpp
Header of Tree distance methods.
genesis::tree::node_path_length_vector
std::vector< size_t > node_path_length_vector(Tree const &tree, TreeNode const &node)
Return a vector containing the depth of all nodes with respect to the given start node.
Definition: tree/function/distances.cpp:98
genesis::tree::NewickBroker::size
size_t size() const
Returns the size of the stack, i.e. the number of nodes stored in the broker.
Definition: broker.cpp:178
genesis::tree::postorder
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
Definition: tree/iterator/postorder.hpp:321
genesis::tree::NewickWriter::node_to_element_plugins
std::vector< node_to_element_function > node_to_element_plugins
Collect all functions to be called for each TreeNode in order to translate it to a Newick representat...
Definition: tree/formats/newick/writer.hpp:216