A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
placement/function/helper.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <limits>
42 
43 #ifdef GENESIS_OPENMP
44 # include <omp.h>
45 #endif
46 
47 namespace genesis {
48 namespace placement {
49 
50 // =================================================================================================
51 // Helper Functions
52 // =================================================================================================
53 
54 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( PlacementTree const& tree )
55 {
56  auto en_map = std::unordered_map<int, PlacementTreeEdge*>();
57  for (
59  it != tree.end_edges();
60  ++it
61  ) {
62  auto const& edge = *it;
63  auto const& edge_data = edge->data<PlacementEdgeData>();
64  assert( en_map.count( edge_data.edge_num() ) == 0);
65  en_map.emplace( edge_data.edge_num(), edge.get());
66  }
67  return en_map;
68 }
69 
70 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( Sample const& smp )
71 {
72  return edge_num_to_edge_map( smp.tree() );
73 }
74 
75 std::vector<std::vector< Pquery const* >> pqueries_per_edge(
76  Sample const& sample,
77  bool only_max_lwr_placements
78 ) {
79  auto result = std::vector<std::vector< Pquery const* >>();
80  result.resize( sample.tree().edge_count() );
81 
82  for( auto const& pqry : sample.pqueries() ) {
83  if( only_max_lwr_placements ) {
84 
85  // If we are only interested in the most probably placement, find it first.
86  PqueryPlacement const* max_p = nullptr;
87  double max_v = std::numeric_limits<double>::lowest();
88  for( auto const& place : pqry.placements() ) {
89  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
90  max_v = place.like_weight_ratio;
91  max_p = &place;
92  }
93  }
94  // If there is one, add it to the list for its edge.
95  if( max_p ) {
96  result[ max_p->edge().index() ].push_back( &pqry );
97  }
98 
99  } else {
100  // If we instead want all placement, simply add them.
101  for( auto const& place : pqry.placements() ) {
102  result[ place.edge().index() ].push_back( &pqry );
103  }
104  }
105  }
106 
107  return result;
108 }
109 
110 std::vector< std::vector< PqueryPlacement const* >> placements_per_edge(
111  Sample const& smp,
112  bool only_max_lwr_placements
113 ) {
114  std::vector< std::vector< PqueryPlacement const* >> result;
115  result.resize( smp.tree().edge_count() );
116 
117  for( auto const& pqry : smp.pqueries() ) {
118  if( only_max_lwr_placements ) {
119 
120  // If we are only interested in the most probably placement, find it first.
121  PqueryPlacement const* max_p = nullptr;
122  double max_v = std::numeric_limits<double>::lowest();
123  for( auto const& place : pqry.placements() ) {
124  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
125  max_v = place.like_weight_ratio;
126  max_p = &place;
127  }
128  }
129  // If there is one, add it to the list for its edge.
130  if( max_p ) {
131  result[ max_p->edge().index() ].push_back( max_p );
132  }
133 
134  } else {
135  // If we instead want all placement, simply add them.
136  for( auto const& place : pqry.placements() ) {
137  result[ place.edge().index() ].push_back( &place );
138  }
139  }
140  }
141 
142  return result;
143 }
144 
145 std::vector<PqueryPlacement const*> placements_per_edge(
146  Sample const& smp,
147  PlacementTreeEdge const& edge
148 ) {
149  std::vector<PqueryPlacement const*> result;
150 
151  for( auto const& pqry : smp.pqueries() ) {
152  for( auto const& place : pqry.placements() ) {
153  if( &place.edge() == &edge ) {
154  result.push_back( &place );
155  }
156  }
157  }
158 
159  return result;
160 }
161 
162 std::vector<size_t> placement_count_per_edge( Sample const& sample )
163 {
164  auto result = std::vector<size_t>( sample.tree().edge_count(), 0 );
165 
166  for( auto const& pqry : sample.pqueries() ) {
167  for( auto const& place : pqry.placements() ) {
168  ++result[ place.edge().index() ];
169  }
170  }
171 
172  return result;
173 }
174 
176 {
177  // Basics.
178  auto const set_size = sample_set.size();
179  if( set_size == 0 ) {
180  return {};
181  }
182 
183  // Init matrix.
184  auto result = utils::Matrix<size_t>( set_size, sample_set[ 0 ].sample.tree().edge_count(), 0 );
185 
186  // Fill matrix.
187  #pragma omp parallel for
188  for( size_t i = 0; i < set_size; ++i ) {
189  for( auto const& pqry : sample_set[ i ].sample.pqueries() ) {
190  for( auto const& place : pqry.placements() ) {
191  ++result( i, place.edge().index() );
192  }
193  }
194  }
195 
196  return result;
197 }
198 
199 std::vector<double> placement_weight_per_edge( Sample const& sample )
200 {
201  auto result = std::vector<double>( sample.tree().edge_count(), 0.0 );
202 
203  for( auto const& pqry : sample.pqueries() ) {
204  for( auto const& place : pqry.placements() ) {
205  result[ place.edge().index() ] += place.like_weight_ratio;
206  }
207  }
208 
209  return result;
210 }
211 
213 {
214  // Init matrix.
215  auto const set_size = sample_set.size();
216  auto result = utils::Matrix<double>( set_size, sample_set[ 0 ].sample.tree().edge_count(), 0.0 );
217 
218  // Return completely empty matrix in edge cases.
219  if( result.rows() == 0 || result.cols() == 0 ) {
220  return {};
221  }
222 
223  // Fill matrix.
224  #pragma omp parallel for schedule(dynamic)
225  for( size_t i = 0; i < set_size; ++i ) {
226  auto const& smp = sample_set[ i ].sample;
227 
228  if( smp.tree().edge_count() != result.cols() ) {
229  throw std::runtime_error(
230  "Cannot calculate placement weights per edge matrix "
231  "for Samples with Trees of different size."
232  );
233  }
234 
235  for( auto const& pqry : smp.pqueries() ) {
236  for( auto const& place : pqry.placements() ) {
237  result( i, place.edge().index() ) += place.like_weight_ratio;
238  }
239  }
240  }
241 
242  return result;
243 }
244 
245 std::vector<PqueryPlain> plain_queries( Sample const & smp )
246 {
247  auto pqueries = std::vector<PqueryPlain>( smp.size() );
248 
249  #pragma omp parallel for
250  for (size_t i = 0; i < smp.size(); ++i) {
251  pqueries[i].index = i;
252 
253  const auto& opqry = smp.at(i);
254  pqueries[i].placements = std::vector<PqueryPlacementPlain>(opqry.placement_size());
255 
256  for (size_t j = 0; j < opqry.placement_size(); ++j) {
257  auto const& oplace = opqry.placement_at(j);
258  auto& place = pqueries[i].placements[j];
259 
260  place.edge_index = oplace.edge().index();
261  place.primary_node_index = oplace.edge().primary_node().index();
262  place.secondary_node_index = oplace.edge().secondary_node().index();
263 
264  auto const& oplace_data = oplace.edge().data<PlacementEdgeData>();
265  place.branch_length = oplace_data.branch_length;
266  place.pendant_length = oplace.pendant_length;
267  place.proximal_length = oplace.proximal_length;
268  place.like_weight_ratio = oplace.like_weight_ratio;
269  }
270  }
271  return pqueries;
272 }
273 
274 // =================================================================================================
275 // Verification
276 // =================================================================================================
277 
278 void rectify_values( Sample& sample )
279 {
280  for( auto& pqry : sample.pqueries() ) {
281 
282  // Rectify placement values
283  double lwr_sum = 0.0;
284  for( auto& p : pqry.placements() ) {
285 
286  // LWR
287  if( p.like_weight_ratio < 0.0 ) {
288  p.like_weight_ratio = 0.0;
289  }
290  if( p.like_weight_ratio > 1.0) {
291  p.like_weight_ratio = 1.0;
292  }
293  lwr_sum += p.like_weight_ratio;
294 
295  // Pendant length
296  if( p.pendant_length < 0.0 ) {
297  p.pendant_length = 0.0;
298  }
299 
300  // Proximal length
301  if( p.proximal_length < 0.0 ) {
302  p.proximal_length = 0.0;
303  }
304  if( p.proximal_length > p.edge().data<PlacementEdgeData>().branch_length ) {
305  p.proximal_length = p.edge().data<PlacementEdgeData>().branch_length;
306  }
307  }
308 
309  // If the total sum of LWRs is too big, rectify it.
310  if( lwr_sum > 1.0 ) {
311  normalize_weight_ratios( pqry );
312  }
313 
314  // Rectify name values
315  for( auto& n : pqry.names() ) {
316  // Negative multiplcities are invalid. As we don't know the actual value,
317  // better play it save and set it so 0, so that it is ignored,
318  // rather than setting it to 1, which would mean to fully use this name.
319  if( n.multiplicity < 0.0 ) {
320  n.multiplicity = 0.0;
321  }
322  }
323  }
324 }
325 
327 {
328  for( auto& smp : sset ) {
329  rectify_values( smp.sample );
330  }
331 }
332 
333 
335 {
336  // Edge numbers need to be in ascending order via postorder traversal.
337  int current = 0;
338  for( auto it : postorder(tree) ) {
339  // The last iteration is skipped, as the root does not have an edge.
340  if (it.is_last_iteration()) {
341  continue;
342  }
343 
344  it.edge().data<PlacementEdgeData>().reset_edge_num( current );
345  ++current;
346  }
347 }
348 
350 {
351  // List all edge nums.
352  auto order = std::vector<int>();
353  for( auto const& edge : tree.edges() ) {
354  order.push_back( edge->data<PlacementEdgeData>().edge_num() );
355  }
356 
357  // Sort them and use unique to see wether there are duplicates.
358  std::sort( order.begin(), order.end() );
359  if( std::unique( order.begin(), order.end() ) != order.end() ) {
360  return false;
361  }
362 
363  // Now check whether they are consecutive and start at 0.
364  if( order.empty() ) {
365  return true;
366  } else {
367  assert( order.size() >= 1 );
368  return order.front() == 0 && order.back() == static_cast<int>( order.size() - 1 );
369  }
370 }
371 
373 {
374  int current = 0;
375 
376  // Edge numbers need to be in ascending order via postorder traversal. Check this.
377  for( auto it : postorder(tree) ) {
378  // The last iteration is skipped, as the root does not have an edge.
379  if (it.is_last_iteration()) {
380  continue;
381  }
382 
383  if( it.edge().data<PlacementEdgeData>().edge_num() != current) {
384  return false;
385  }
386  ++current;
387  }
388 
389  return true;
390 }
391 
392 bool validate( Sample const& smp, bool check_values, bool break_on_values )
393 {
394  // check tree
395  if( ! tree::validate_topology( smp.tree() ) ) {
396  LOG_INFO << "Invalid placement tree topology.";
397  return false;
398  }
399  if( ! tree::tree_data_is< PlacementNodeData, PlacementEdgeData >( smp.tree() )) {
400  LOG_INFO << "Tree does not only contain Placement Node and Edge data types.";
401  return false;
402  }
403 
404  // check edges
405  std::unordered_map<int, PlacementTreeEdge*> edge_num_map;
406  for (
407  auto it_e = smp.tree().begin_edges();
408  it_e != smp.tree().end_edges();
409  ++it_e
410  ) {
411  // make sure every edge num is used once only
412  PlacementTreeEdge& edge = *(*it_e).get();
413  if( edge_num_map.count( edge.data<PlacementEdgeData>().edge_num() ) > 0 ) {
414  LOG_INFO << "More than one edge has edge_num '"
415  << edge.data<PlacementEdgeData>().edge_num() << "'.";
416  return false;
417  }
418  edge_num_map.emplace(
419  edge.data<PlacementEdgeData>().edge_num(), (*it_e).get()
420  );
421  }
422  if( ! has_correct_edge_nums( smp.tree() )) {
423  LOG_INFO << "Tree does not have correct edge nums.";
424  return false;
425  }
426 
427  // check pqueries
428  size_t pqry_place_count = 0;
429  for( auto const& pqry : smp.pqueries() ) {
430  // use this name for reporting invalid placements.
431  std::string name;
432  if (pqry.name_size() > 0) {
433  name = "'" + pqry.name_at(0).name + "'";
434  } else {
435  name = "(unnamed pquery)";
436  }
437 
438  // check placements
439  if (check_values && pqry.placement_size() == 0) {
440  LOG_INFO << "Pquery without any placements at '" << name << "'.";
441  if (break_on_values) {
442  return false;
443  }
444  }
445  double ratio_sum = 0.0;
446  for( auto pit = pqry.begin_placements(); pit != pqry.end_placements(); ++pit ) {
447  auto const& p = *pit;
448  auto const& edge_data = p.edge().data<PlacementEdgeData>();
449 
450  // Check if the placement has a valid pointer to its edge. This is a bit hard to do,
451  // as we use a raw pointer, so there is no easy way of telling whether it is valid
452  // or dangling. Instead, we simply check if the object behind it has the correct
453  // properties.
454  if(
455  p.edge().index() >= smp.tree().edge_count() ||
456  edge_num_map.count( edge_data.edge_num() ) == 0 ||
457  edge_data.edge_num() != p.edge_num()
458  ) {
459  LOG_INFO << "Invlaid edge pointer or edge num.";
460  return false;
461  }
462 
463  // now we know that all references between placements and edges are correct, so this
464  // assertion breaks only if we forgot to check some sort of weird inconsistency.
465  assert( edge_num_map.count( p.edge_num() ) > 0 );
466  ++pqry_place_count;
467 
468  // check numerical values
469  if (!check_values) {
470  continue;
471  }
472  if (p.like_weight_ratio < 0.0 || p.like_weight_ratio > 1.0) {
473  LOG_INFO << "Invalid placement with like_weight_ratio '" << p.like_weight_ratio
474  << "' not in [0.0, 1.0] at " << name << ".";
475  if (break_on_values) {
476  return false;
477  }
478  }
479  if (p.pendant_length < 0.0 || p.proximal_length < 0.0) {
480  LOG_INFO << "Invalid placement with pendant_length '" << p.pendant_length
481  << "' or proximal_length '" << p.proximal_length << "' < 0.0 at "
482  << name << ".";
483  if (break_on_values) {
484  return false;
485  }
486  }
487  if (p.proximal_length > edge_data.branch_length) {
488  LOG_INFO << "Invalid placement with proximal_length '" << p.proximal_length
489  << "' > branch_length '"
490  << edge_data.branch_length << "' at "
491  << name << ".";
492  if (break_on_values) {
493  return false;
494  }
495  }
496  ratio_sum += p.like_weight_ratio;
497  }
498 
499  // Check the sum of LWRs, with some small tolerance.
500  if (check_values && ratio_sum > 1.000000001) {
501  LOG_INFO << "Invalid pquery with sum of like_weight_ratio '" << ratio_sum
502  << "' > 1.0 at " << name << ".";
503  if (break_on_values) {
504  return false;
505  }
506  }
507 
508  // check names
509  if (check_values && pqry.name_size() == 0) {
510  LOG_INFO << "Pquery without any names at '" << name << "'.";
511  if (break_on_values) {
512  return false;
513  }
514  }
515  }
516 
517  return true;
518 }
519 
520 } // namespace placement
521 } // namespace genesis
IteratorEdges begin_edges()
Definition: tree/tree.cpp:498
std::vector< std::vector< PqueryPlacement const * > > placements_per_edge(Sample const &smp, bool only_max_lwr_placements)
Return a mapping from each PlacementTreeEdges to the PqueryPlacements that are placed on that edge...
IteratorEdges end_edges()
Definition: tree/tree.cpp:503
typename ContainerType< TreeEdge >::const_iterator ConstIteratorEdges
Definition: tree/tree.hpp:132
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
void rectify_values(Sample &sample)
Correct invalid values of the PqueryPlacements and PqueryNames as good as possible.
Tree operator functions.
bool has_consecutive_edge_nums(PlacementTree const &tree)
Verify that the PlacementTree has no duplicate edge_nums and that they form consecutive numbers start...
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:185
std::vector< double > placement_weight_per_edge(Sample const &sample)
Return a vector that contains the sum of the weights of the PqueryPlacements per edge of the tree of ...
Provides functions for working with Placements and Pqueries.
void normalize_weight_ratios(Pquery &pquery)
Recalculate the like_weight_ratio of the PqueryPlacement&s of a Pquery, so that their sum is 1...
One placement position of a Pquery on a Tree.
std::vector< size_t > placement_count_per_edge(Sample const &sample)
Return a vector that contains the number of PqueryPlacements per edge of the tree of the Sample...
utils::Range< iterator_placements > placements()
Return a Range iterator to the PqueryPlacements.
Definition: pquery.cpp:73
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
void reset_edge_nums(PlacementTree &tree)
Reset all edge nums of a PlacementTree.
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
std::vector< std::vector< Pquery const * > > pqueries_per_edge(Sample const &sample, bool only_max_lwr_placements)
Return a mapping from each edge to the Pqueries on that edge.
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
Provides easy and fast logging functionality.
Header of PqueryPlain class.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
const PlacementTreeEdge & edge() const
Get the PlacementTreeEdge where this PqueryPlacement is placed.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
double branch_length
Branch length of the edge.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
std::unordered_map< int, PlacementTreeEdge * > edge_num_to_edge_map(PlacementTree const &tree)
Return a mapping of edge_num integers to the corresponding PlacementTreeEdge object.
double like_weight_ratio
Likelihood weight ratio of this placement.
EdgeDataType & data()
Definition: edge.hpp:118
bool has_correct_edge_nums(PlacementTree const &tree)
Verify that the tree has correctly set edge nums.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98
bool validate(Sample const &smp, bool check_values, bool break_on_values)
Validate the integrity of the pointers, references and data in a Sample object.
int edge_num() const
Return the edge_num of this edge. This value is defined by the jplace standard.