A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
placement/function/helper.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <limits>
42 
43 #ifdef GENESIS_OPENMP
44 # include <omp.h>
45 #endif
46 
47 namespace genesis {
48 namespace placement {
49 
50 // =================================================================================================
51 // Helper Functions
52 // =================================================================================================
53 
54 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( PlacementTree const& tree )
55 {
56  auto en_map = std::unordered_map<int, PlacementTreeEdge*>();
57  for (
59  it != tree.end_edges();
60  ++it
61  ) {
62  auto const& edge = *it;
63  auto const& edge_data = edge->data<PlacementEdgeData>();
64  assert( en_map.count( edge_data.edge_num() ) == 0);
65  en_map.emplace( edge_data.edge_num(), edge.get());
66  }
67  return en_map;
68 }
69 
70 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( Sample const& smp )
71 {
72  return edge_num_to_edge_map( smp.tree() );
73 }
74 
75 std::vector<std::vector< Pquery const* >> pqueries_per_edge(
76  Sample const& sample,
77  bool only_max_lwr_placements
78 ) {
79  auto result = std::vector<std::vector< Pquery const* >>();
80  result.resize( sample.tree().edge_count() );
81 
82  for( auto const& pqry : sample.pqueries() ) {
83  if( only_max_lwr_placements ) {
84 
85  // If we are only interested in the most probably placement, find it first.
86  PqueryPlacement const* max_p = nullptr;
87  double max_v = std::numeric_limits<double>::lowest();
88  for( auto const& place : pqry.placements() ) {
89  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
90  max_v = place.like_weight_ratio;
91  max_p = &place;
92  }
93  }
94  // If there is one, add it to the list for its edge.
95  if( max_p ) {
96  result[ max_p->edge().index() ].push_back( &pqry );
97  }
98 
99  } else {
100  // If we instead want all placement, simply add them.
101  for( auto const& place : pqry.placements() ) {
102  result[ place.edge().index() ].push_back( &pqry );
103  }
104  }
105  }
106 
107  return result;
108 }
109 
110 std::vector< std::vector< PqueryPlacement const* >> placements_per_edge(
111  Sample const& smp,
112  bool only_max_lwr_placements
113 ) {
114  std::vector< std::vector< PqueryPlacement const* >> result;
115  result.resize( smp.tree().edge_count() );
116 
117  for( auto const& pqry : smp.pqueries() ) {
118  if( only_max_lwr_placements ) {
119 
120  // If we are only interested in the most probably placement, find it first.
121  PqueryPlacement const* max_p = nullptr;
122  double max_v = std::numeric_limits<double>::lowest();
123  for( auto const& place : pqry.placements() ) {
124  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
125  max_v = place.like_weight_ratio;
126  max_p = &place;
127  }
128  }
129  // If there is one, add it to the list for its edge.
130  if( max_p ) {
131  result[ max_p->edge().index() ].push_back( max_p );
132  }
133 
134  } else {
135  // If we instead want all placement, simply add them.
136  for( auto const& place : pqry.placements() ) {
137  result[ place.edge().index() ].push_back( &place );
138  }
139  }
140  }
141 
142  return result;
143 }
144 
145 std::vector<PqueryPlacement const*> placements_per_edge(
146  Sample const& smp,
147  PlacementTreeEdge const& edge
148 ) {
149  std::vector<PqueryPlacement const*> result;
150 
151  for( auto const& pqry : smp.pqueries() ) {
152  for( auto const& place : pqry.placements() ) {
153  if( &place.edge() == &edge ) {
154  result.push_back( &place );
155  }
156  }
157  }
158 
159  return result;
160 }
161 
162 std::vector<size_t> placement_count_per_edge( Sample const& sample )
163 {
164  auto result = std::vector<size_t>( sample.tree().edge_count(), 0 );
165 
166  for( auto const& pqry : sample.pqueries() ) {
167  for( auto const& place : pqry.placements() ) {
168  ++result[ place.edge().index() ];
169  }
170  }
171 
172  return result;
173 }
174 
176 {
177  // Basics.
178  auto const set_size = sample_set.size();
179  if( set_size == 0 ) {
180  return {};
181  }
182 
183  // Init matrix.
184  auto result = utils::Matrix<size_t>( set_size, sample_set[ 0 ].sample.tree().edge_count(), 0 );
185 
186  // Fill matrix.
187  #pragma omp parallel for
188  for( size_t i = 0; i < set_size; ++i ) {
189  for( auto const& pqry : sample_set[ i ].sample.pqueries() ) {
190  for( auto const& place : pqry.placements() ) {
191  ++result( i, place.edge().index() );
192  }
193  }
194  }
195 
196  return result;
197 }
198 
199 std::vector<double> placement_weight_per_edge( Sample const& sample )
200 {
201  auto result = std::vector<double>( sample.tree().edge_count(), 0.0 );
202 
203  for( auto const& pqry : sample.pqueries() ) {
204  for( auto const& place : pqry.placements() ) {
205  result[ place.edge().index() ] += place.like_weight_ratio;
206  }
207  }
208 
209  return result;
210 }
211 
213 {
214  // Basics.
215  auto const set_size = sample_set.size();
216  if( set_size == 0 ) {
217  return {};
218  }
219 
220  // Init matrix.
221  auto result = utils::Matrix<double>( set_size, sample_set[ 0 ].sample.tree().edge_count(), 0.0 );
222 
223  // Fill matrix.
224  #pragma omp parallel for
225  for( size_t i = 0; i < set_size; ++i ) {
226  for( auto const& pqry : sample_set[ i ].sample.pqueries() ) {
227  for( auto const& place : pqry.placements() ) {
228  result( i, place.edge().index() ) += place.like_weight_ratio;
229  }
230  }
231  }
232 
233  return result;
234 }
235 
236 std::vector<PqueryPlain> plain_queries( Sample const & smp )
237 {
238  auto pqueries = std::vector<PqueryPlain>( smp.size() );
239 
240  #pragma omp parallel for
241  for (size_t i = 0; i < smp.size(); ++i) {
242  pqueries[i].index = i;
243 
244  const auto& opqry = smp.at(i);
245  pqueries[i].placements = std::vector<PqueryPlacementPlain>(opqry.placement_size());
246 
247  for (size_t j = 0; j < opqry.placement_size(); ++j) {
248  auto const& oplace = opqry.placement_at(j);
249  auto& place = pqueries[i].placements[j];
250 
251  place.edge_index = oplace.edge().index();
252  place.primary_node_index = oplace.edge().primary_node().index();
253  place.secondary_node_index = oplace.edge().secondary_node().index();
254 
255  auto const& oplace_data = oplace.edge().data<PlacementEdgeData>();
256  place.branch_length = oplace_data.branch_length;
257  place.pendant_length = oplace.pendant_length;
258  place.proximal_length = oplace.proximal_length;
259  place.like_weight_ratio = oplace.like_weight_ratio;
260  }
261  }
262  return pqueries;
263 }
264 
265 // =================================================================================================
266 // Verification
267 // =================================================================================================
268 
269 void rectify_values( Sample& sample )
270 {
271  for( auto& pqry : sample.pqueries() ) {
272 
273  // Rectify placement values
274  double lwr_sum = 0.0;
275  for( auto& p : pqry.placements() ) {
276 
277  // LWR
278  if( p.like_weight_ratio < 0.0 ) {
279  p.like_weight_ratio = 0.0;
280  }
281  if( p.like_weight_ratio > 1.0) {
282  p.like_weight_ratio = 1.0;
283  }
284  lwr_sum += p.like_weight_ratio;
285 
286  // Pendant length
287  if( p.pendant_length < 0.0 ) {
288  p.pendant_length = 0.0;
289  }
290 
291  // Proximal length
292  if( p.proximal_length < 0.0 ) {
293  p.proximal_length = 0.0;
294  }
295  if( p.proximal_length > p.edge().data<PlacementEdgeData>().branch_length ) {
296  p.proximal_length = p.edge().data<PlacementEdgeData>().branch_length;
297  }
298  }
299 
300  // If the total sum of LWRs is too big, rectify it.
301  if( lwr_sum > 1.0 ) {
302  normalize_weight_ratios( pqry );
303  }
304 
305  // Rectify name values
306  for( auto& n : pqry.names() ) {
307  // Negative multiplcities are invalid. As we don't know the actual value,
308  // better play it save and set it so 0, so that it is ignored,
309  // rather than setting it to 1, which would mean to fully use this name.
310  if( n.multiplicity < 0.0 ) {
311  n.multiplicity = 0.0;
312  }
313  }
314  }
315 }
316 
318 {
319  for( auto& smp : sset ) {
320  rectify_values( smp.sample );
321  }
322 }
323 
324 
326 {
327  // Edge numbers need to be in ascending order via postorder traversal.
328  int current = 0;
329  for( auto it : postorder(tree) ) {
330  // The last iteration is skipped, as the root does not have an edge.
331  if (it.is_last_iteration()) {
332  continue;
333  }
334 
335  it.edge().data<PlacementEdgeData>().reset_edge_num( current );
336  ++current;
337  }
338 }
339 
341 {
342  // List all edge nums.
343  auto order = std::vector<int>();
344  for( auto const& edge : tree.edges() ) {
345  order.push_back( edge->data<PlacementEdgeData>().edge_num() );
346  }
347 
348  // Sort them and use unique to see wether there are duplicates.
349  std::sort( order.begin(), order.end() );
350  if( std::unique( order.begin(), order.end() ) != order.end() ) {
351  return false;
352  }
353 
354  // Now check whether they are consecutive and start at 0.
355  if( order.empty() ) {
356  return true;
357  } else {
358  assert( order.size() >= 1 );
359  return order.front() == 0 && order.back() == static_cast<int>( order.size() - 1 );
360  }
361 }
362 
364 {
365  int current = 0;
366 
367  // Edge numbers need to be in ascending order via postorder traversal. Check this.
368  for( auto it : postorder(tree) ) {
369  // The last iteration is skipped, as the root does not have an edge.
370  if (it.is_last_iteration()) {
371  continue;
372  }
373 
374  if( it.edge().data<PlacementEdgeData>().edge_num() != current) {
375  return false;
376  }
377  ++current;
378  }
379 
380  return true;
381 }
382 
383 bool validate( Sample const& smp, bool check_values, bool break_on_values )
384 {
385  // check tree
386  if( ! tree::validate_topology( smp.tree() ) ) {
387  LOG_INFO << "Invalid placement tree topology.";
388  return false;
389  }
390  if( ! tree::tree_data_is< PlacementNodeData, PlacementEdgeData >( smp.tree() )) {
391  LOG_INFO << "Tree does not only contain Placement Node and Edge data types.";
392  return false;
393  }
394 
395  // check edges
396  std::unordered_map<int, PlacementTreeEdge*> edge_num_map;
397  for (
398  auto it_e = smp.tree().begin_edges();
399  it_e != smp.tree().end_edges();
400  ++it_e
401  ) {
402  // make sure every edge num is used once only
403  PlacementTreeEdge& edge = *(*it_e).get();
404  if( edge_num_map.count( edge.data<PlacementEdgeData>().edge_num() ) > 0 ) {
405  LOG_INFO << "More than one edge has edge_num '"
406  << edge.data<PlacementEdgeData>().edge_num() << "'.";
407  return false;
408  }
409  edge_num_map.emplace(
410  edge.data<PlacementEdgeData>().edge_num(), (*it_e).get()
411  );
412  }
413  if( ! has_correct_edge_nums( smp.tree() )) {
414  LOG_INFO << "Tree does not have correct edge nums.";
415  return false;
416  }
417 
418  // check pqueries
419  size_t pqry_place_count = 0;
420  for( auto const& pqry : smp.pqueries() ) {
421  // use this name for reporting invalid placements.
422  std::string name;
423  if (pqry.name_size() > 0) {
424  name = "'" + pqry.name_at(0).name + "'";
425  } else {
426  name = "(unnamed pquery)";
427  }
428 
429  // check placements
430  if (check_values && pqry.placement_size() == 0) {
431  LOG_INFO << "Pquery without any placements at '" << name << "'.";
432  if (break_on_values) {
433  return false;
434  }
435  }
436  double ratio_sum = 0.0;
437  for( auto pit = pqry.begin_placements(); pit != pqry.end_placements(); ++pit ) {
438  auto const& p = *pit;
439  auto const& edge_data = p.edge().data<PlacementEdgeData>();
440 
441  // Check if the placement has a valid pointer to its edge. This is a bit hard to do,
442  // as we use a raw pointer, so there is no easy way of telling whether it is valid
443  // or dangling. Instead, we simply check if the object behind it has the correct
444  // properties.
445  if(
446  p.edge().index() >= smp.tree().edge_count() ||
447  edge_num_map.count( edge_data.edge_num() ) == 0 ||
448  edge_data.edge_num() != p.edge_num()
449  ) {
450  LOG_INFO << "Invlaid edge pointer or edge num.";
451  return false;
452  }
453 
454  // now we know that all references between placements and edges are correct, so this
455  // assertion breaks only if we forgot to check some sort of weird inconsistency.
456  assert( edge_num_map.count( p.edge_num() ) > 0 );
457  ++pqry_place_count;
458 
459  // check numerical values
460  if (!check_values) {
461  continue;
462  }
463  if (p.like_weight_ratio < 0.0 || p.like_weight_ratio > 1.0) {
464  LOG_INFO << "Invalid placement with like_weight_ratio '" << p.like_weight_ratio
465  << "' not in [0.0, 1.0] at " << name << ".";
466  if (break_on_values) {
467  return false;
468  }
469  }
470  if (p.pendant_length < 0.0 || p.proximal_length < 0.0) {
471  LOG_INFO << "Invalid placement with pendant_length '" << p.pendant_length
472  << "' or proximal_length '" << p.proximal_length << "' < 0.0 at "
473  << name << ".";
474  if (break_on_values) {
475  return false;
476  }
477  }
478  if (p.proximal_length > edge_data.branch_length) {
479  LOG_INFO << "Invalid placement with proximal_length '" << p.proximal_length
480  << "' > branch_length '"
481  << edge_data.branch_length << "' at "
482  << name << ".";
483  if (break_on_values) {
484  return false;
485  }
486  }
487  ratio_sum += p.like_weight_ratio;
488  }
489 
490  // Check the sum of LWRs, with some small tolerance.
491  if (check_values && ratio_sum > 1.000000001) {
492  LOG_INFO << "Invalid pquery with sum of like_weight_ratio '" << ratio_sum
493  << "' > 1.0 at " << name << ".";
494  if (break_on_values) {
495  return false;
496  }
497  }
498 
499  // check names
500  if (check_values && pqry.name_size() == 0) {
501  LOG_INFO << "Pquery without any names at '" << name << "'.";
502  if (break_on_values) {
503  return false;
504  }
505  }
506  }
507 
508  return true;
509 }
510 
511 } // namespace placement
512 } // namespace genesis
IteratorEdges begin_edges()
Definition: tree/tree.cpp:498
std::vector< std::vector< PqueryPlacement const * > > placements_per_edge(Sample const &smp, bool only_max_lwr_placements)
Return a mapping from each PlacementTreeEdges to the PqueryPlacements that are placed on that edge...
IteratorEdges end_edges()
Definition: tree/tree.cpp:503
typename ContainerType< TreeEdge >::const_iterator ConstIteratorEdges
Definition: tree/tree.hpp:132
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
void rectify_values(Sample &sample)
Correct invalid values of the PqueryPlacements and PqueryNames as good as possible.
Tree operator functions.
bool has_consecutive_edge_nums(PlacementTree const &tree)
Verify that the PlacementTree has no duplicate edge_nums and that they form consecutive numbers start...
const PlacementTreeEdge & edge() const
Get the PlacementTreeEdge where this PqueryPlacement is placed.
Definition: placement.cpp:58
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:185
std::vector< double > placement_weight_per_edge(Sample const &sample)
Return a vector that contains the sum of the weights of the PqueryPlacements per edge of the tree of ...
Provides functions for working with Placements and Pqueries.
void normalize_weight_ratios(Pquery &pquery)
Recalculate the like_weight_ratio of the PqueryPlacement&s of a Pquery, so that their sum is 1...
One placement position of a Pquery on a Tree.
std::vector< size_t > placement_count_per_edge(Sample const &sample)
Return a vector that contains the number of PqueryPlacements per edge of the tree of the Sample...
utils::Range< iterator_placements > placements()
Return a Range iterator to the PqueryPlacements.
Definition: pquery.cpp:82
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
void reset_edge_nums(PlacementTree &tree)
Reset all edge nums of a PlacementTree.
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
std::vector< std::vector< Pquery const * > > pqueries_per_edge(Sample const &sample, bool only_max_lwr_placements)
Return a mapping from each edge to the Pqueries on that edge.
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
Provides easy and fast logging functionality.
Header of PqueryPlain class.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.cpp:135
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
double branch_length
Branch length of the edge.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
std::unordered_map< int, PlacementTreeEdge * > edge_num_to_edge_map(PlacementTree const &tree)
Return a mapping of edge_num integers to the corresponding PlacementTreeEdge object.
double like_weight_ratio
Likelihood weight ratio of this placement.
EdgeDataType & data()
Definition: edge.hpp:118
bool has_correct_edge_nums(PlacementTree const &tree)
Verify that the tree has correctly set edge nums.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98
bool validate(Sample const &smp, bool check_values, bool break_on_values)
Validate the integrity of the pointers, references and data in a Sample object.
int edge_num() const
Return the edge_num of this edge. This value is defined by the jplace standard.