A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
placement/function/helper.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 
40 #include <algorithm>
41 #include <cassert>
42 #include <limits>
43 
44 #ifdef GENESIS_OPENMP
45 # include <omp.h>
46 #endif
47 
48 namespace genesis {
49 namespace placement {
50 
51 // =================================================================================================
52 // Helper Functions
53 // =================================================================================================
54 
55 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( PlacementTree const& tree )
56 {
57  auto en_map = std::unordered_map<int, PlacementTreeEdge*>();
58  for (
60  it != tree.end_edges();
61  ++it
62  ) {
63  auto const& edge = *it;
64  auto const& edge_data = edge->data<PlacementEdgeData>();
65  assert( en_map.count( edge_data.edge_num() ) == 0);
66  en_map.emplace( edge_data.edge_num(), edge.get());
67  }
68  return en_map;
69 }
70 
71 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( Sample const& smp )
72 {
73  return edge_num_to_edge_map( smp.tree() );
74 }
75 
76 std::vector<std::vector< Pquery const* >> pqueries_per_edge(
77  Sample const& sample,
78  bool only_max_lwr_placements
79 ) {
80  auto result = std::vector<std::vector< Pquery const* >>();
81  result.resize( sample.tree().edge_count() );
82 
83  for( auto const& pqry : sample.pqueries() ) {
84  if( only_max_lwr_placements ) {
85 
86  // If we are only interested in the most probably placement, find it first.
87  PqueryPlacement const* max_p = nullptr;
88  double max_v = std::numeric_limits<double>::lowest();
89  for( auto const& place : pqry.placements() ) {
90  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
91  max_v = place.like_weight_ratio;
92  max_p = &place;
93  }
94  }
95  // If there is one, add it to the list for its edge.
96  if( max_p ) {
97  result[ max_p->edge().index() ].push_back( &pqry );
98  }
99 
100  } else {
101  // If we instead want all placement, simply add them.
102  for( auto const& place : pqry.placements() ) {
103  result[ place.edge().index() ].push_back( &pqry );
104  }
105  }
106  }
107 
108  return result;
109 }
110 
111 std::vector< std::vector< PqueryPlacement const* >> placements_per_edge(
112  Sample const& smp,
113  bool only_max_lwr_placements
114 ) {
115  std::vector< std::vector< PqueryPlacement const* >> result;
116  result.resize( smp.tree().edge_count() );
117 
118  for( auto const& pqry : smp.pqueries() ) {
119  if( only_max_lwr_placements ) {
120 
121  // If we are only interested in the most probably placement, find it first.
122  PqueryPlacement const* max_p = nullptr;
123  double max_v = std::numeric_limits<double>::lowest();
124  for( auto const& place : pqry.placements() ) {
125  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
126  max_v = place.like_weight_ratio;
127  max_p = &place;
128  }
129  }
130  // If there is one, add it to the list for its edge.
131  if( max_p ) {
132  result[ max_p->edge().index() ].push_back( max_p );
133  }
134 
135  } else {
136  // If we instead want all placement, simply add them.
137  for( auto const& place : pqry.placements() ) {
138  result[ place.edge().index() ].push_back( &place );
139  }
140  }
141  }
142 
143  return result;
144 }
145 
146 std::vector<PqueryPlacement const*> placements_per_edge(
147  Sample const& smp,
148  PlacementTreeEdge const& edge
149 ) {
150  std::vector<PqueryPlacement const*> result;
151 
152  for( auto const& pqry : smp.pqueries() ) {
153  for( auto const& place : pqry.placements() ) {
154  if( &place.edge() == &edge ) {
155  result.push_back( &place );
156  }
157  }
158  }
159 
160  return result;
161 }
162 
163 std::vector<size_t> placement_count_per_edge( Sample const& sample )
164 {
165  auto result = std::vector<size_t>( sample.tree().edge_count(), 0 );
166 
167  for( auto const& pqry : sample.pqueries() ) {
168  for( auto const& place : pqry.placements() ) {
169  ++result[ place.edge().index() ];
170  }
171  }
172 
173  return result;
174 }
175 
177 {
178  // Basics.
179  auto const set_size = sample_set.size();
180  if( set_size == 0 ) {
181  return {};
182  }
183 
184  // Init matrix.
185  auto result = utils::Matrix<size_t>( set_size, sample_set[ 0 ].sample.tree().edge_count(), 0 );
186 
187  // Fill matrix.
188  #pragma omp parallel for
189  for( size_t i = 0; i < set_size; ++i ) {
190  for( auto const& pqry : sample_set[ i ].sample.pqueries() ) {
191  for( auto const& place : pqry.placements() ) {
192  ++result( i, place.edge().index() );
193  }
194  }
195  }
196 
197  return result;
198 }
199 
200 std::vector<PqueryPlain> plain_queries( Sample const & smp )
201 {
202  auto pqueries = std::vector<PqueryPlain>( smp.size() );
203 
204  #pragma omp parallel for
205  for (size_t i = 0; i < smp.size(); ++i) {
206  const auto& opqry = smp.at(i);
207 
208  pqueries[i].index = i;
209  pqueries[i].multiplicity = total_multiplicity( opqry );
210  pqueries[i].placements = std::vector<PqueryPlacementPlain>(opqry.placement_size());
211 
212  for (size_t j = 0; j < opqry.placement_size(); ++j) {
213  auto const& oplace = opqry.placement_at(j);
214  auto& place = pqueries[i].placements[j];
215 
216  place.edge_index = oplace.edge().index();
217  place.primary_node_index = oplace.edge().primary_node().index();
218  place.secondary_node_index = oplace.edge().secondary_node().index();
219 
220  auto const& oplace_data = oplace.edge().data<PlacementEdgeData>();
221  place.branch_length = oplace_data.branch_length;
222  place.pendant_length = oplace.pendant_length;
223  place.proximal_length = oplace.proximal_length;
224  place.like_weight_ratio = oplace.like_weight_ratio;
225  }
226  }
227  return pqueries;
228 }
229 
230 // =================================================================================================
231 // Verification
232 // =================================================================================================
233 
234 void rectify_values( Sample& sample )
235 {
236  for( auto& pqry : sample.pqueries() ) {
237 
238  // Rectify placement values
239  double lwr_sum = 0.0;
240  for( auto& p : pqry.placements() ) {
241 
242  // LWR
243  if( p.like_weight_ratio < 0.0 ) {
244  p.like_weight_ratio = 0.0;
245  }
246  if( p.like_weight_ratio > 1.0) {
247  p.like_weight_ratio = 1.0;
248  }
249  lwr_sum += p.like_weight_ratio;
250 
251  // Pendant length
252  if( p.pendant_length < 0.0 ) {
253  p.pendant_length = 0.0;
254  }
255 
256  // Proximal length
257  if( p.proximal_length < 0.0 ) {
258  p.proximal_length = 0.0;
259  }
260  if( p.proximal_length > p.edge().data<PlacementEdgeData>().branch_length ) {
261  p.proximal_length = p.edge().data<PlacementEdgeData>().branch_length;
262  }
263  }
264 
265  // If the total sum of LWRs is too big, rectify it.
266  if( lwr_sum > 1.0 ) {
267  normalize_weight_ratios( pqry );
268  }
269 
270  // Rectify name values
271  for( auto& n : pqry.names() ) {
272  // Negative multiplcities are invalid. As we don't know the actual value,
273  // better play it save and set it so 0, so that it is ignored,
274  // rather than setting it to 1, which would mean to fully use this name.
275  if( n.multiplicity < 0.0 ) {
276  n.multiplicity = 0.0;
277  }
278  }
279  }
280 }
281 
283 {
284  for( auto& smp : sset ) {
285  rectify_values( smp.sample );
286  }
287 }
288 
289 
291 {
292  // Edge numbers need to be in ascending order via postorder traversal.
293  int current = 0;
294  for( auto it : postorder(tree) ) {
295  // The last iteration is skipped, as the root does not have an edge.
296  if (it.is_last_iteration()) {
297  continue;
298  }
299 
300  it.edge().data<PlacementEdgeData>().reset_edge_num( current );
301  ++current;
302  }
303 }
304 
306 {
307  // List all edge nums.
308  auto order = std::vector<int>();
309  for( auto const& edge : tree.edges() ) {
310  order.push_back( edge->data<PlacementEdgeData>().edge_num() );
311  }
312 
313  // Sort them and use unique to see wether there are duplicates.
314  std::sort( order.begin(), order.end() );
315  if( std::unique( order.begin(), order.end() ) != order.end() ) {
316  return false;
317  }
318 
319  // Now check whether they are consecutive and start at 0.
320  if( order.empty() ) {
321  return true;
322  } else {
323  assert( order.size() >= 1 );
324  return order.front() == 0 && order.back() == static_cast<int>( order.size() - 1 );
325  }
326 }
327 
329 {
330  int current = 0;
331 
332  // Edge numbers need to be in ascending order via postorder traversal. Check this.
333  for( auto it : postorder(tree) ) {
334  // The last iteration is skipped, as the root does not have an edge.
335  if (it.is_last_iteration()) {
336  continue;
337  }
338 
339  if( it.edge().data<PlacementEdgeData>().edge_num() != current) {
340  return false;
341  }
342  ++current;
343  }
344 
345  return true;
346 }
347 
348 bool validate( Sample const& smp, bool check_values, bool break_on_values )
349 {
350  // check tree
351  if( ! tree::validate_topology( smp.tree() ) ) {
352  LOG_INFO << "Invalid placement tree topology.";
353  return false;
354  }
355  if( ! tree::tree_data_is< PlacementNodeData, PlacementEdgeData >( smp.tree() )) {
356  LOG_INFO << "Tree does not only contain Placement Node and Edge data types.";
357  return false;
358  }
359 
360  // check edges
361  std::unordered_map<int, PlacementTreeEdge*> edge_num_map;
362  for (
363  auto it_e = smp.tree().begin_edges();
364  it_e != smp.tree().end_edges();
365  ++it_e
366  ) {
367  // make sure every edge num is used once only
368  PlacementTreeEdge& edge = *(*it_e).get();
369  if( edge_num_map.count( edge.data<PlacementEdgeData>().edge_num() ) > 0 ) {
370  LOG_INFO << "More than one edge has edge_num '"
371  << edge.data<PlacementEdgeData>().edge_num() << "'.";
372  return false;
373  }
374  edge_num_map.emplace(
375  edge.data<PlacementEdgeData>().edge_num(), (*it_e).get()
376  );
377  }
378  if( ! has_correct_edge_nums( smp.tree() )) {
379  LOG_INFO << "Tree does not have correct edge nums.";
380  return false;
381  }
382 
383  // check pqueries
384  size_t pqry_place_count = 0;
385  for( auto const& pqry : smp.pqueries() ) {
386  // use this name for reporting invalid placements.
387  std::string name;
388  if (pqry.name_size() > 0) {
389  name = "'" + pqry.name_at(0).name + "'";
390  } else {
391  name = "(unnamed pquery)";
392  }
393 
394  // check placements
395  if (check_values && pqry.placement_size() == 0) {
396  LOG_INFO << "Pquery without any placements at '" << name << "'.";
397  if (break_on_values) {
398  return false;
399  }
400  }
401  double ratio_sum = 0.0;
402  for( auto pit = pqry.begin_placements(); pit != pqry.end_placements(); ++pit ) {
403  auto const& p = *pit;
404  auto const& edge_data = p.edge().data<PlacementEdgeData>();
405 
406  // Check if the placement has a valid pointer to its edge. This is a bit hard to do,
407  // as we use a raw pointer, so there is no easy way of telling whether it is valid
408  // or dangling. Instead, we simply check if the object behind it has the correct
409  // properties.
410  if(
411  p.edge().index() >= smp.tree().edge_count() ||
412  edge_num_map.count( edge_data.edge_num() ) == 0 ||
413  edge_data.edge_num() != p.edge_num()
414  ) {
415  LOG_INFO << "Invlaid edge pointer or edge num.";
416  return false;
417  }
418 
419  // now we know that all references between placements and edges are correct, so this
420  // assertion breaks only if we forgot to check some sort of weird inconsistency.
421  assert( edge_num_map.count( p.edge_num() ) > 0 );
422  ++pqry_place_count;
423 
424  // check numerical values
425  if (!check_values) {
426  continue;
427  }
428  if (p.like_weight_ratio < 0.0 || p.like_weight_ratio > 1.0) {
429  LOG_INFO << "Invalid placement with like_weight_ratio '" << p.like_weight_ratio
430  << "' not in [0.0, 1.0] at " << name << ".";
431  if (break_on_values) {
432  return false;
433  }
434  }
435  if (p.pendant_length < 0.0 || p.proximal_length < 0.0) {
436  LOG_INFO << "Invalid placement with pendant_length '" << p.pendant_length
437  << "' or proximal_length '" << p.proximal_length << "' < 0.0 at "
438  << name << ".";
439  if (break_on_values) {
440  return false;
441  }
442  }
443  if (p.proximal_length > edge_data.branch_length) {
444  LOG_INFO << "Invalid placement with proximal_length '" << p.proximal_length
445  << "' > branch_length '"
446  << edge_data.branch_length << "' at "
447  << name << ".";
448  if (break_on_values) {
449  return false;
450  }
451  }
452  ratio_sum += p.like_weight_ratio;
453  }
454 
455  // Check the sum of LWRs, with some small tolerance.
456  if (check_values && ratio_sum > 1.000000001) {
457  LOG_INFO << "Invalid pquery with sum of like_weight_ratio '" << ratio_sum
458  << "' > 1.0 at " << name << ".";
459  if (break_on_values) {
460  return false;
461  }
462  }
463 
464  // check names
465  if (check_values && pqry.name_size() == 0) {
466  LOG_INFO << "Pquery without any names at '" << name << "'.";
467  if (break_on_values) {
468  return false;
469  }
470  }
471  }
472 
473  return true;
474 }
475 
476 } // namespace placement
477 } // namespace genesis
IteratorEdges begin_edges()
Definition: tree/tree.cpp:498
std::vector< std::vector< PqueryPlacement const * > > placements_per_edge(Sample const &smp, bool only_max_lwr_placements)
Return a mapping from each PlacementTreeEdges to the PqueryPlacements that are placed on that edge...
IteratorEdges end_edges()
Definition: tree/tree.cpp:503
typename ContainerType< TreeEdge >::const_iterator ConstIteratorEdges
Definition: tree/tree.hpp:132
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:105
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
void rectify_values(Sample &sample)
Correct invalid values of the PqueryPlacements and PqueryNames as good as possible.
Tree operator functions.
bool has_consecutive_edge_nums(PlacementTree const &tree)
Verify that the PlacementTree has no duplicate edge_nums and that they form consecutive numbers start...
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:185
Provides functions for working with Placements and Pqueries.
void normalize_weight_ratios(Pquery &pquery)
Recalculate the like_weight_ratio of the PqueryPlacement&s of a Pquery, so that their sum is 1...
One placement position of a Pquery on a Tree.
std::vector< size_t > placement_count_per_edge(Sample const &sample)
Return a vector that contains the number of PqueryPlacements per edge of the tree of the Sample...
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
double total_multiplicity(Pquery const &pqry)
Return the sum of all multiplicities of the Pquery.
Definition: masses.cpp:65
void reset_edge_nums(PlacementTree &tree)
Reset all edge nums of a PlacementTree.
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
std::vector< std::vector< Pquery const * > > pqueries_per_edge(Sample const &sample, bool only_max_lwr_placements)
Return a mapping from each edge to the Pqueries on that edge.
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
Provides easy and fast logging functionality.
Header of PqueryPlain class.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
const PlacementTreeEdge & edge() const
Get the PlacementTreeEdge where this PqueryPlacement is placed.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
double branch_length
Branch length of the edge.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
std::unordered_map< int, PlacementTreeEdge * > edge_num_to_edge_map(PlacementTree const &tree)
Return a mapping of edge_num integers to the corresponding PlacementTreeEdge object.
double like_weight_ratio
Likelihood weight ratio of this placement.
EdgeDataType & data()
Definition: edge.hpp:183
bool has_correct_edge_nums(PlacementTree const &tree)
Verify that the tree has correctly set edge nums.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98
bool validate(Sample const &smp, bool check_values, bool break_on_values)
Validate the integrity of the pointers, references and data in a Sample object.
int edge_num() const
Return the edge_num of this edge. This value is defined by the jplace standard.