A toolkit for working with phylogenetic data.
v0.24.0
placement/formats/newick_reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_PLACEMENT_FORMATS_NEWICK_READER_H_
2 #define GENESIS_PLACEMENT_FORMATS_NEWICK_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
34 #include <cassert>
35 #include <stdexcept>
36 #include <set>
37 
40 
43 
45 
46 namespace genesis {
47 namespace placement {
48 
49 // =================================================================================================
50 // Placement Tree Newick Reader Plugin
51 // =================================================================================================
52 
57 {
58 public:
59 
60  // -------------------------------------------------------------------------
61  // Constructor and Rule of Five
62  // -------------------------------------------------------------------------
63 
65  virtual ~PlacementTreeNewickReaderPlugin() = default;
66 
69 
72 
73  // -------------------------------------------------------------------------
74  // Plugin Functions
75  // -------------------------------------------------------------------------
76 
77  void element_to_edge( tree::NewickBrokerElement const& element, tree::TreeEdge& edge ) const
78  {
79  // Set defaults.
80  auto& edge_data = edge.data<PlacementEdgeData>();
81  edge_data.reset_edge_num( -1 );
82 
83  // Depending on the setting, we either use Newick tags {} or comments [] to get the edge nums.
84  auto const& en_vec = ( get_edge_num_from_comments_ ? element.comments : element.tags );
85 
86  // Process the edge num.
87  if( en_vec.size() == 1 ) {
88  edge_data.reset_edge_num( std::stoi( en_vec[0] ));
89 
90  } else {
91  // Error cases.
92  // Check which error occured and report accordingly.
93 
94  // Get a nice readable name.
95  auto name = element.name;
96  if( name.empty() ) {
97  name = "inner node";
98  } else {
99  name = "node '" + name + "'";
100  }
101 
102  // If there is no edge num value at all, we keep it at the default
103  // of -1, which will then be fixed later in finish_reading().
104  // We do not warn about this here, as this might just clutter the output too much
105  // if there are several edges with missing edge_nums.
106  // if( en_vec.size() == 0 ) {
107  // LOG_WARN << "Jplace document contains a Newick tree where the edge at " << name
108  // << " does not contain an edge_num value. "
109  // << "This might be because the document uses an old jplace standard "
110  // << "where edges at the root do not have an edge_num. "
111  // << "We can still work with this tree, but it might also indicate "
112  // << "a more severe issue with the data.";
113  // }
114 
115  // Cannot cope with multiple values, as we would not know which one is the correct
116  // one intended to be used as edge num.
117  if( en_vec.size() > 1 ) {
118  if( get_edge_num_from_comments_ ) {
119  throw std::invalid_argument(
120  "Edge at " + name + " contains more than one comment value such as "
121  "'[xyz]'. Expecting only one for the placement edge_num of this edge."
122  );
123  } else {
124  throw std::invalid_argument(
125  "Edge at " + name + " contains more than one tag value such as "
126  "'{xyz}'. Expecting only one for the placement edge_num of this edge."
127  );
128  }
129  }
130  }
131  }
132 
133  void finish_reading( tree::Tree& tree ) const
134  {
135  // Get a list of all used edge nums and check their uniqueness.
136  // This is a bit wasteful, as we later do a similar check in the JplaceReader,
137  // but we kind of need this here anyway to correctly fix missing edge nums.
138  std::set<PlacementEdgeData::EdgeNumType> edge_nums;
139  for( auto const& edge : tree.edges() ) {
140  auto const edge_num = edge.data<PlacementEdgeData>().edge_num();
141 
142  // Check for uniqueness. We leave out -1 here, just in case that there are multiple
143  // edges that did not get a proper edge num in the file. This will be fixed later anyway.
144  if( edge_num > -1 && edge_nums.count( edge_num ) > 0 ) {
145  throw std::invalid_argument(
146  "Jplace document contains a Newick tree where the edge_num '"
147  + std::to_string( edge_num ) + "' occurs more than once, "
148  "and hence cannot be used to uniquely identify edges of the tree. "
149  "This indicates a severe issue with the program that created the jplace file."
150  );
151  }
152  edge_nums.insert( edge_num );
153  }
154 
155  // Some more safety for the user.
156  if( tree.empty() ) {
157  throw std::invalid_argument( "Jplace document contains an empty Newick tree." );
158  }
159  if( tree.edge_count() >= 3 && edge_nums.size() < tree.edge_count() - 3 ) {
160  // We "allow" 3 edges without edge num before we warn. This can for example be
161  // edges at the root. While having edges without edge num seems to be a thing that only
162  // occurs with SEPP and the old jplace standard version 1, we still allow for this,
163  // just to be nice. But anything above this is highly suspicious (even more so than
164  // not having proper edge nums in the first place), as all these edges without edge nums
165  // are not identifiable and hence cannot receive any placements.
166  // So, let's stop being nice.
167  throw std::invalid_argument(
168  "Jplace document contains too many edges without an edge_num. We can cope with a few "
169  "of them missing. But as none of them can receive any placements, it does not make "
170  "sense if too many are missing. This hence indicates a severe issue with the program "
171  "that created the jplace file. Possibly, the provided jplace version (1-3) does not "
172  "match the format used to specify the edge_num values in the tree."
173  );
174  }
175 
176  // If there are edge nums that were not set by the element_to_edge() function,
177  // we assume that those are some weird edge cases such as an edge at the root.
178  // This seems to be allowed in jplace version 1, or at least seems to be created by SEPP.
179  // It is not a good thing, but we try to cope and fix this.
180  // As these edge cannot have any placements (because there is no way of referring to them
181  // from the pqueries), we simply set a dummy value here, in order for the rest of our code
182  // to have working edge nums.
183  if( edge_nums.count( -1 ) > 0 ) {
184  LOG_WARN << "Jplace document contains a Newick tree where not all edges have a proper "
185  << "edge_num assigned to them. This might be because the document uses "
186  << "an old jplace standard (version 1), where the edge at the root does not "
187  << "have an edge_num. We can still work with this tree, but it might also "
188  << "indicate a more severe issue with the data.";
189 
190  for( auto& edge : tree.edges() ) {
191  auto& edge_data = edge.data<PlacementEdgeData>();
192  if( edge_data.edge_num() == -1 ) {
193 
194  // Get the next bigger number that is not yet used as an edge num.
195  // There must be values in the edge_num set, as we checked at least for
196  // element -1 above.
197  // Use it for the edge, and make sure that it is added to the set for later
198  // iteration.
199  assert( edge_nums.size() > 0 );
200  auto const next_avail = *(edge_nums.rbegin()) + 1;
201  assert( edge_nums.count( next_avail ) == 0 );
202  edge_data.reset_edge_num( next_avail );
203  edge_nums.insert( next_avail );
204  }
205  }
206  }
207 
208  if( ! has_correct_edge_nums( tree )) {
209  LOG_WARN << "Jplace document has a Newick tree where the edge_num tags are non standard. "
210  << "They are expected by the jplace standard to be assigned in ascending order "
211  << "via post-order traversal of the tree. We can still work with this tree, "
212  << "but it might indicate an issue with the data.";
213  }
214  }
215 
216  void register_with( tree::NewickReader& reader ) const
217  {
218  // Set node data creation function.
219  reader.create_node_data_plugin = []( tree::TreeNode& node ){
220  node.reset_data( PlacementNodeData::create() );
221  };
222 
223  // Set edge data creation function.
224  reader.create_edge_data_plugin = []( tree::TreeEdge& edge ){
225  edge.reset_data( PlacementEdgeData::create() );
226  };
227 
228  // Add edge manipulation functions.
229  reader.element_to_edge_plugins.push_back(
230  [&]( tree::NewickBrokerElement const& element, tree::TreeEdge& edge ) {
231  element_to_edge( element, edge );
232  }
233  );
234 
235  // Add finish reading plugin.
236  reader.finish_reading_plugins.push_back(
237  [&]( tree::Tree& tree ) {
238  finish_reading( tree );
239  }
240  );
241  }
242 
243  // -------------------------------------------------------------------------
244  // Settings
245  // -------------------------------------------------------------------------
246 
256  {
257  get_edge_num_from_comments_ = value;
258  return *this;
259  }
260 
267  {
268  return get_edge_num_from_comments_;
269  }
270 
271  // -------------------------------------------------------------------------
272  // Member Data
273  // -------------------------------------------------------------------------
274 
275 private:
276 
277  bool get_edge_num_from_comments_ = false;
278 
279 };
280 
281 // =================================================================================================
282 // Placement Tree Newick Reader
283 // =================================================================================================
284 
286  : public tree::NewickReader
289 {
290 public:
291 
292  // -------------------------------------------------------------------------
293  // Constructor and Rule of Five
294  // -------------------------------------------------------------------------
295 
297  {
298  // Jplace files use tags. Activate them!
299  enable_tags( true );
300 
301  // We first register the default reader, then the placement reader, because the latter
302  // overwrites the data creation functions.
303  CommonTreeNewickReaderPlugin::register_with( *this );
305  }
306 };
307 
308 } // namespace placement
309 } // namespace genesis
310 
311 #endif // include guard
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
void reset_edge_num(EdgeNumType val)
Force to set the edge_num to a certain value.
PlacementTreeNewickReaderPlugin & get_edge_num_from_comments(bool value)
Set whether to use tags or comments of the Newick tree for the edge nums.
#define LOG_WARN
Log a warning. See genesis::utils::LoggingLevel.
Definition: logging.hpp:96
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
PlacementTreeNewickReaderPlugin & operator=(PlacementTreeNewickReaderPlugin const &)=default
void element_to_edge(tree::NewickBrokerElement const &element, tree::TreeEdge &edge) const
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
std::vector< std::string > comments
Arbitrary strings that can be attached to a node, e.g. in Newick format via "[]". ...
Definition: element.hpp:132
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
create_edge_data_function create_edge_data_plugin
std::vector< std::string > tags
Arbitrary strings that can be attached to a node, e.g. in Newick format via "{}". ...
Definition: element.hpp:127
std::vector< finish_reading_function > finish_reading_plugins
Provides easy and fast logging functionality.
static std::unique_ptr< PlacementEdgeData > create()
std::vector< element_to_edge_function > element_to_edge_plugins
create_node_data_function create_node_data_plugin
Provide a set of plugin functions for NewickReader to read a CommonTree.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
static std::unique_ptr< PlacementNodeData > create()
bool has_correct_edge_nums(PlacementTree const &tree)
Verify that the tree has correctly set edge nums.
bool get_edge_num_from_comments() const
Get whether to use tags or comments of the Newick tree for the edge nums.
std::string name
Name of the node.
Definition: element.hpp:114
EdgeDataType & data()
Definition: edge.hpp:217
Store the information for one element of a Newick tree.
Definition: element.hpp:60