A library for working with phylogenetic and population genetic data.
v0.32.0
placement/function/helper.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 
40 #include <algorithm>
41 #include <cassert>
42 #include <limits>
43 
44 #ifdef GENESIS_OPENMP
45 # include <omp.h>
46 #endif
47 
48 namespace genesis {
49 namespace placement {
50 
51 // =================================================================================================
52 // Helper Functions
53 // =================================================================================================
54 
55 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( PlacementTree const& tree )
56 {
57  auto en_map = std::unordered_map<int, PlacementTreeEdge*>();
58  for( auto& edge : tree.edges() ) {
59  auto const& edge_data = edge.data<PlacementEdgeData>();
60  assert( en_map.count( edge_data.edge_num() ) == 0);
61  en_map.emplace( edge_data.edge_num(), &edge );
62  }
63  return en_map;
64 }
65 
66 std::unordered_map<int, PlacementTreeEdge*> edge_num_to_edge_map( Sample const& smp )
67 {
68  return edge_num_to_edge_map( smp.tree() );
69 }
70 
71 std::vector<std::vector< Pquery const* >> pqueries_per_edge(
72  Sample const& sample,
73  bool only_max_lwr_placements
74 ) {
75  auto result = std::vector<std::vector< Pquery const* >>();
76  result.resize( sample.tree().edge_count() );
77 
78  for( auto const& pqry : sample.pqueries() ) {
79  if( only_max_lwr_placements ) {
80 
81  // If we are only interested in the most probably placement, find it first.
82  PqueryPlacement const* max_p = nullptr;
83  double max_v = std::numeric_limits<double>::lowest();
84  for( auto const& place : pqry.placements() ) {
85  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
86  max_v = place.like_weight_ratio;
87  max_p = &place;
88  }
89  }
90  // If there is one, add it to the list for its edge.
91  if( max_p ) {
92  result[ max_p->edge().index() ].push_back( &pqry );
93  }
94 
95  } else {
96  // If we instead want all placement, simply add them.
97  for( auto const& place : pqry.placements() ) {
98  result[ place.edge().index() ].push_back( &pqry );
99  }
100  }
101  }
102 
103  return result;
104 }
105 
106 std::vector< std::vector< PqueryPlacement const* >> placements_per_edge(
107  Sample const& smp,
108  bool only_max_lwr_placements
109 ) {
110  std::vector< std::vector< PqueryPlacement const* >> result;
111  result.resize( smp.tree().edge_count() );
112 
113  for( auto const& pqry : smp.pqueries() ) {
114  if( only_max_lwr_placements ) {
115 
116  // If we are only interested in the most probably placement, find it first.
117  PqueryPlacement const* max_p = nullptr;
118  double max_v = std::numeric_limits<double>::lowest();
119  for( auto const& place : pqry.placements() ) {
120  if( max_p == nullptr || place.like_weight_ratio > max_v ) {
121  max_v = place.like_weight_ratio;
122  max_p = &place;
123  }
124  }
125  // If there is one, add it to the list for its edge.
126  if( max_p ) {
127  result[ max_p->edge().index() ].push_back( max_p );
128  }
129 
130  } else {
131  // If we instead want all placement, simply add them.
132  for( auto const& place : pqry.placements() ) {
133  result[ place.edge().index() ].push_back( &place );
134  }
135  }
136  }
137 
138  return result;
139 }
140 
141 std::vector<PqueryPlacement const*> placements_per_edge(
142  Sample const& smp,
143  PlacementTreeEdge const& edge
144 ) {
145  std::vector<PqueryPlacement const*> result;
146 
147  for( auto const& pqry : smp.pqueries() ) {
148  for( auto const& place : pqry.placements() ) {
149  if( &place.edge() == &edge ) {
150  result.push_back( &place );
151  }
152  }
153  }
154 
155  return result;
156 }
157 
158 std::vector<size_t> placement_count_per_edge( Sample const& sample )
159 {
160  auto result = std::vector<size_t>( sample.tree().edge_count(), 0 );
161 
162  for( auto const& pqry : sample.pqueries() ) {
163  for( auto const& place : pqry.placements() ) {
164  ++result[ place.edge().index() ];
165  }
166  }
167 
168  return result;
169 }
170 
172 {
173  // Basics.
174  auto const set_size = sample_set.size();
175  if( set_size == 0 ) {
176  return {};
177  }
178 
179  // Init matrix.
180  auto result = utils::Matrix<size_t>( set_size, sample_set[ 0 ].tree().edge_count(), 0 );
181 
182  // Fill matrix.
183  #pragma omp parallel for
184  for( size_t i = 0; i < set_size; ++i ) {
185  for( auto const& pqry : sample_set[ i ].pqueries() ) {
186  for( auto const& place : pqry.placements() ) {
187  ++result( i, place.edge().index() );
188  }
189  }
190  }
191 
192  return result;
193 }
194 
195 std::vector<PqueryPlain> plain_queries( Sample const & smp )
196 {
197  auto pqueries = std::vector<PqueryPlain>( smp.size() );
198 
199  #pragma omp parallel for
200  for (size_t i = 0; i < smp.size(); ++i) {
201  const auto& opqry = smp.at(i);
202 
203  pqueries[i].index = i;
204  pqueries[i].multiplicity = total_multiplicity( opqry );
205  pqueries[i].placements = std::vector<PqueryPlacementPlain>(opqry.placement_size());
206 
207  for (size_t j = 0; j < opqry.placement_size(); ++j) {
208  auto const& oplace = opqry.placement_at(j);
209  auto& place = pqueries[i].placements[j];
210 
211  place.edge_index = oplace.edge().index();
212  place.primary_node_index = oplace.edge().primary_node().index();
213  place.secondary_node_index = oplace.edge().secondary_node().index();
214 
215  auto const& oplace_data = oplace.edge().data<PlacementEdgeData>();
216  place.branch_length = oplace_data.branch_length;
217  place.pendant_length = oplace.pendant_length;
218  place.proximal_length = oplace.proximal_length;
219  place.like_weight_ratio = oplace.like_weight_ratio;
220  }
221  }
222  return pqueries;
223 }
224 
225 // =================================================================================================
226 // Verification
227 // =================================================================================================
228 
229 void rectify_values( Sample& sample )
230 {
231  for( auto& pqry : sample.pqueries() ) {
232 
233  // Rectify placement values
234  double lwr_sum = 0.0;
235  for( auto& p : pqry.placements() ) {
236 
237  // LWR
238  if( p.like_weight_ratio < 0.0 ) {
239  p.like_weight_ratio = 0.0;
240  }
241  if( p.like_weight_ratio > 1.0) {
242  p.like_weight_ratio = 1.0;
243  }
244  lwr_sum += p.like_weight_ratio;
245 
246  // Pendant length
247  if( p.pendant_length < 0.0 ) {
248  p.pendant_length = 0.0;
249  }
250 
251  // Proximal length
252  if( p.proximal_length < 0.0 ) {
253  p.proximal_length = 0.0;
254  }
255  if( p.proximal_length > p.edge().data<PlacementEdgeData>().branch_length ) {
256  p.proximal_length = p.edge().data<PlacementEdgeData>().branch_length;
257  }
258  }
259 
260  // If the total sum of LWRs is too big, rectify it.
261  if( lwr_sum > 1.0 ) {
262  normalize_weight_ratios( pqry );
263  }
264 
265  // Rectify name values
266  for( auto& n : pqry.names() ) {
267  // Negative multiplcities are invalid. As we don't know the actual value,
268  // better play it save and set it so 0, so that it is ignored,
269  // rather than setting it to 1, which would mean to fully use this name.
270  if( n.multiplicity < 0.0 ) {
271  n.multiplicity = 0.0;
272  }
273  }
274  }
275 }
276 
278 {
279  for( auto& smp : sset ) {
280  rectify_values( smp );
281  }
282 }
283 
284 
286 {
287  // Edge numbers need to be in ascending order via postorder traversal.
288  int current = 0;
289  for( auto it : postorder(tree) ) {
290  // The last iteration is skipped, as the root does not have an edge.
291  if (it.is_last_iteration()) {
292  continue;
293  }
294 
295  it.edge().data<PlacementEdgeData>().reset_edge_num( current );
296  ++current;
297  }
298 }
299 
301 {
302  // List all edge nums.
303  auto order = std::vector<int>();
304  for( auto const& edge : tree.edges() ) {
305  order.push_back( edge.data<PlacementEdgeData>().edge_num() );
306  }
307 
308  // Sort them and use unique to see wether there are duplicates.
309  std::sort( order.begin(), order.end() );
310  if( std::unique( order.begin(), order.end() ) != order.end() ) {
311  return false;
312  }
313 
314  // Now check whether they are consecutive and start at 0.
315  if( order.empty() ) {
316  return true;
317  } else {
318  assert( order.size() >= 1 );
319  return order.front() == 0 && order.back() == static_cast<int>( order.size() - 1 );
320  }
321 }
322 
324 {
325  int current = 0;
326 
327  // Edge numbers need to be in ascending order via postorder traversal. Check this.
328  for( auto it : postorder(tree) ) {
329  // The last iteration is skipped, as the root does not have an edge.
330  if (it.is_last_iteration()) {
331  continue;
332  }
333 
334  if( it.edge().data<PlacementEdgeData>().edge_num() != current) {
335  return false;
336  }
337  ++current;
338  }
339 
340  return true;
341 }
342 
343 bool validate( Sample const& smp, bool check_values, bool break_on_values )
344 {
345  // check tree
346  if( ! tree::validate_topology( smp.tree() ) ) {
347  LOG_INFO << "Invalid placement tree topology.";
348  return false;
349  }
350  if( ! tree::tree_data_is< PlacementNodeData, PlacementEdgeData >( smp.tree() )) {
351  LOG_INFO << "Tree does not only contain Placement Node and Edge data types.";
352  return false;
353  }
354 
355  // check edges
356  std::unordered_map<int, PlacementTreeEdge const*> edge_num_map;
357  for( auto const& edge : smp.tree().edges() ) {
358  // make sure every edge num is used once only
359  if( edge_num_map.count( edge.data<PlacementEdgeData>().edge_num() ) > 0 ) {
360  LOG_INFO << "More than one edge has edge_num '"
361  << edge.data<PlacementEdgeData>().edge_num() << "'.";
362  return false;
363  }
364  edge_num_map.emplace(
365  edge.data<PlacementEdgeData>().edge_num(), &edge
366  );
367  }
368  if( ! has_correct_edge_nums( smp.tree() )) {
369  LOG_INFO << "Tree does not have correct edge nums.";
370  return false;
371  }
372 
373  // check pqueries
374  for( auto const& pqry : smp.pqueries() ) {
375  // use this name for reporting invalid placements.
376  std::string name;
377  if (pqry.name_size() > 0) {
378  name = "'" + pqry.name_at(0).name + "'";
379  } else {
380  name = "(unnamed pquery)";
381  }
382 
383  // check placements
384  if (check_values && pqry.placement_size() == 0) {
385  LOG_INFO << "Pquery without any placements at '" << name << "'.";
386  if (break_on_values) {
387  return false;
388  }
389  }
390  double ratio_sum = 0.0;
391  for( auto pit = pqry.begin_placements(); pit != pqry.end_placements(); ++pit ) {
392  auto const& p = *pit;
393  auto const& edge_data = p.edge().data<PlacementEdgeData>();
394 
395  // Check if the placement has a valid pointer to its edge. This is a bit hard to do,
396  // as we use a raw pointer, so there is no easy way of telling whether it is valid
397  // or dangling. Instead, we simply check if the object behind it has the correct
398  // properties.
399  if(
400  p.edge().index() >= smp.tree().edge_count() ||
401  edge_num_map.count( edge_data.edge_num() ) == 0 ||
402  edge_data.edge_num() != p.edge_num()
403  ) {
404  LOG_INFO << "Invlaid edge pointer or edge num.";
405  return false;
406  }
407 
408  // now we know that all references between placements and edges are correct, so this
409  // assertion breaks only if we forgot to check some sort of weird inconsistency.
410  assert( edge_num_map.count( p.edge_num() ) > 0 );
411 
412  // check numerical values
413  if (!check_values) {
414  continue;
415  }
416  if (p.like_weight_ratio < 0.0 || p.like_weight_ratio > 1.0) {
417  LOG_INFO << "Invalid placement with like_weight_ratio '" << p.like_weight_ratio
418  << "' not in [0.0, 1.0] at " << name << ".";
419  if (break_on_values) {
420  return false;
421  }
422  }
423  if (p.pendant_length < 0.0 || p.proximal_length < 0.0) {
424  LOG_INFO << "Invalid placement with pendant_length '" << p.pendant_length
425  << "' or proximal_length '" << p.proximal_length << "' < 0.0 at "
426  << name << ".";
427  if (break_on_values) {
428  return false;
429  }
430  }
431  if (p.proximal_length > edge_data.branch_length) {
432  LOG_INFO << "Invalid placement with proximal_length '" << p.proximal_length
433  << "' > branch_length '"
434  << edge_data.branch_length << "' at "
435  << name << ".";
436  if (break_on_values) {
437  return false;
438  }
439  }
440  ratio_sum += p.like_weight_ratio;
441  }
442 
443  // Check the sum of LWRs, with some small tolerance.
444  if (check_values && ratio_sum > 1.000000001) {
445  LOG_INFO << "Invalid pquery with sum of like_weight_ratio '" << ratio_sum
446  << "' > 1.0 at " << name << ".";
447  if (break_on_values) {
448  return false;
449  }
450  }
451 
452  // check names
453  if (check_values && pqry.name_size() == 0) {
454  LOG_INFO << "Pquery without any names at '" << name << "'.";
455  if (break_on_values) {
456  return false;
457  }
458  }
459  }
460 
461  return true;
462 }
463 
464 } // namespace placement
465 } // namespace genesis
LOG_INFO
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:100
genesis::placement::Sample::tree
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:124
genesis::placement::placement_count_per_edge
std::vector< size_t > placement_count_per_edge(Sample const &sample)
Return a vector that contains the number of PqueryPlacements per edge of the tree of the Sample.
Definition: placement/function/helper.cpp:158
genesis::placement::SampleSet
Store a set of Samples with associated names.
Definition: sample_set.hpp:54
genesis::placement::PqueryPlacement::edge
PlacementTreeEdge const & edge() const
Get the PlacementTreeEdge where this PqueryPlacement is placed.
Definition: placement/pquery/placement.hpp:194
genesis::placement::Sample::size
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:138
genesis::placement::PlacementEdgeData::edge_num
EdgeNumType edge_num() const
Return the edge_num of this edge. This value is defined by the jplace standard.
Definition: placement_tree.hpp:194
genesis::placement::Sample
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on.
Definition: sample.hpp:68
genesis::placement::has_correct_edge_nums
bool has_correct_edge_nums(PlacementTree const &tree)
Verify that the tree has correctly set edge nums.
Definition: placement/function/helper.cpp:323
genesis::placement::pqueries_per_edge
std::vector< std::vector< Pquery const * > > pqueries_per_edge(Sample const &sample, bool only_max_lwr_placements)
Return a mapping from each edge to the Pqueries on that edge.
Definition: placement/function/helper.cpp:71
genesis::tree::TreeEdge::index
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:106
genesis::placement::plain_queries
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
Definition: placement/function/helper.cpp:195
functions.hpp
Provides functions for working with Placements and Pqueries.
genesis::placement::rectify_values
void rectify_values(Sample &sample)
Correct invalid values of the PqueryPlacements and PqueryNames as good as possible.
Definition: placement/function/helper.cpp:229
genesis::utils::Matrix< size_t >
genesis::placement::has_consecutive_edge_nums
bool has_consecutive_edge_nums(PlacementTree const &tree)
Verify that the PlacementTree has no duplicate edge_nums and that they form consecutive numbers start...
Definition: placement/function/helper.cpp:300
genesis::placement::PlacementEdgeData
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num,...
Definition: placement_tree.hpp:139
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
genesis::placement::edge_num_to_edge_map
std::unordered_map< int, PlacementTreeEdge * > edge_num_to_edge_map(PlacementTree const &tree)
Return a mapping of edge_num integers to the corresponding PlacementTreeEdge object.
Definition: placement/function/helper.cpp:55
genesis::placement::PqueryPlacement::like_weight_ratio
double like_weight_ratio
Likelihood weight ratio of this placement.
Definition: placement/pquery/placement.hpp:137
logging.hpp
Provides easy and fast logging functionality.
genesis::tree::CommonEdgeData::branch_length
double branch_length
Branch length of the edge.
Definition: tree/common_tree/tree.hpp:193
masses.hpp
genesis::placement::Sample::at
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:190
genesis::tree::TreeEdge
Definition: edge.hpp:60
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
operators.hpp
Tree operator functions.
genesis::tree::Tree::edges
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
genesis::placement::normalize_weight_ratios
void normalize_weight_ratios(Pquery &pquery)
Recalculate the like_weight_ratio of the PqueryPlacement&s of a Pquery, so that their sum is 1....
Definition: placement/function/functions.cpp:123
genesis::placement::SampleSet::size
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
plain.hpp
Header of PqueryPlain class.
genesis::placement::reset_edge_nums
void reset_edge_nums(PlacementTree &tree)
Reset all edge nums of a PlacementTree.
Definition: placement/function/helper.cpp:285
genesis::placement::PqueryPlacement
One placement position of a Pquery on a Tree.
Definition: placement/pquery/placement.hpp:75
genesis::placement::placements_per_edge
std::vector< std::vector< PqueryPlacement const * > > placements_per_edge(Sample const &smp, bool only_max_lwr_placements)
Return a mapping from each PlacementTreeEdges to the PqueryPlacements that are placed on that edge.
Definition: placement/function/helper.cpp:106
postorder.hpp
genesis::placement::Sample::pqueries
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:264
genesis::tree::validate_topology
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
Definition: tree/function/operators.cpp:332
genesis::placement::total_multiplicity
double total_multiplicity(Pquery const &pqry)
Return the sum of all multiplicities of the Pquery.
Definition: masses.cpp:65
genesis::tree::edge_count
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
Definition: tree/function/functions.cpp:212
genesis::placement::validate
bool validate(Sample const &smp, bool check_values, bool break_on_values)
Validate the integrity of the pointers, references and data in a Sample object.
Definition: placement/function/helper.cpp:343
helper.hpp
genesis::tree::postorder
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
Definition: tree/iterator/postorder.hpp:321
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272