A library for working with phylogenetic and population genetic data.
v0.27.0
placement/function/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
42 
43 #include <algorithm>
44 #include <cassert>
45 #include <cmath>
46 #include <exception>
47 #include <regex>
48 #include <string>
49 #include <unordered_map>
50 #include <unordered_set>
51 #include <vector>
52 
53 #ifdef GENESIS_OPENMP
54 # include <omp.h>
55 #endif
56 
57 namespace genesis {
58 namespace placement {
59 
60 // =================================================================================================
61 // Pquery Names
62 // =================================================================================================
63 
64 bool has_name( Pquery const& pquery, std::string const& name )
65 {
66  for( size_t i = 0; i < pquery.name_size(); ++i ) {
67  if( pquery.name_at(i).name == name ) {
68  return true;
69  }
70  }
71  return false;
72 }
73 
74 bool has_name( Sample const& smp, std::string const& name )
75 {
76  for( auto const& pqry : smp.pqueries() ) {
77  if( has_name( pqry, name ) ) {
78  return true;
79  }
80  }
81  return false;
82 }
83 
84 Pquery const* find_pquery( Sample const& smp, std::string const& name )
85 {
86  // TODO instead of pointer, return an iterator!
87  // then use find if directly!
88  for( auto const& pqry : smp.pqueries() ) {
89  if( has_name( pqry, name )) {
90  return &pqry;
91  }
92  }
93  return nullptr;
94 }
95 
96 Pquery* find_pquery( Sample& smp, std::string const& name )
97 {
98  // TODO instead of pointer, return an iterator!
99  // then use find if directly!
100  for( auto& pqry : smp.pqueries() ) {
101  if( has_name( pqry, name )) {
102  return &pqry;
103  }
104  }
105  return nullptr;
106 }
107 
108 std::unordered_set<std::string> all_pquery_names( Sample const& sample )
109 {
110  std::unordered_set<std::string> result;
111  for( auto const& pquery : sample ) {
112  for( auto const& pname : pquery.names() ) {
113  result.insert( pname.name );
114  }
115  }
116  return result;
117 }
118 
119 // =================================================================================================
120 // Normalization and Sorting
121 // =================================================================================================
122 
124 {
125  double sum = 0.0;
126  for( auto const& place : pquery.placements() ) {
127  sum += place.like_weight_ratio;
128  }
129  if( sum == 0.0 ) {
130  throw std::overflow_error( "Cannot normalize weight ratios if all of them are zero." );
131  }
132  for( auto& place : pquery.placements() ) {
133  place.like_weight_ratio /= sum;
134  }
135 }
136 
138 {
139  for( auto pqry_it = smp.begin(); pqry_it != smp.end(); ++pqry_it ) {
140  normalize_weight_ratios( *pqry_it );
141  }
142 }
143 
144 // void sort_placements_by_proximal_length( PlacementTreeEdge& edge );
145 // void sort_placements_by_proximal_length( Sample& smp );
146 
148 {
149  std::sort(
150  pquery.begin_placements(),
151  pquery.end_placements(),
152  [] (
153  PqueryPlacement const& lhs,
154  PqueryPlacement const& rhs
155  ) {
156  return lhs.like_weight_ratio > rhs.like_weight_ratio;
157  }
158  );
159 }
160 
162 {
163  for( auto pqry_it = smp.begin(); pqry_it != smp.end(); ++pqry_it ) {
164  sort_placements_by_weight( *pqry_it );
165  }
166 }
167 
168 void scale_all_branch_lengths( Sample& smp, double factor )
169 {
170  tree::scale_all_branch_lengths( smp.tree(), factor );
171  for( auto& pqry : smp.pqueries() ) {
172  for( auto& place : pqry.placements() ) {
173  place.proximal_length *= factor;
174  }
175  }
176 }
177 
178 void adjust_branch_lengths( Sample& sample, tree::Tree const& source )
179 {
180  // Basic error check.
181  if( sample.tree().edge_count() != source.edge_count() ) {
182  throw std::runtime_error( "Sizes of the Sample::tree() and source tree differ." );
183  }
184 
185  // First, adjust the placement positions,
186  // because we need the old branch lengths of the sample for correct scaling.
187  #pragma omp parallel for
188  for( size_t i = 0; i < sample.size(); ++i ) {
189  auto& pquery = sample.at(i);
190 
191  for( auto& placement : pquery.placements() ) {
192  auto const edge_idx = placement.edge().index();
193  auto const old_bl = placement.edge().data<PlacementEdgeData>().branch_length;
194  auto const new_bl = source.edge_at(edge_idx).data<tree::CommonEdgeData>().branch_length;
195 
196  placement.proximal_length *= new_bl / old_bl;
197  }
198  }
199 
200  // Now adjust the reference tree branch lengths.
201  #pragma omp parallel for
202  for( size_t edge_idx = 0; edge_idx < source.edge_count(); ++edge_idx ) {
203  auto const new_bl = source.edge_at(edge_idx).data<tree::CommonEdgeData>().branch_length;
204  sample.tree().edge_at(edge_idx).data<PlacementEdgeData>().branch_length = new_bl;
205  }
206 }
207 
208 // =================================================================================================
209 // Filtering Placements
210 // =================================================================================================
211 
212 void filter_min_accumulated_weight( Pquery& pquery, double threshold )
213 {
214  // Sort, so that the most likely placements are in the beginning.
215  sort_placements_by_weight( pquery );
216 
217  // Find the position where enough weight is accumulated.
218  size_t i = 0;
219  double weight_sum = 0.0;
220  do {
221  weight_sum += pquery.placement_at(i).like_weight_ratio;
222  ++i;
223  } while( weight_sum < threshold && i < pquery.placement_size() );
224  assert( i > 0 );
225 
226  // Remove the rest.
227  if( i < pquery.placement_size() ) {
228  auto& placements = pquery.expose_placements();
229  placements.erase( placements.begin() + i, placements.end() );
230  }
231  assert( pquery.placement_size() == i );
232 
233  // Slow old version.
234  // while( pquery.placement_size() > i ) {
235  // pquery.remove_placement_at( pquery.placement_size() - 1 );
236  // }
237 }
238 
239 void filter_min_accumulated_weight( Sample& smp, double threshold )
240 {
241  for( auto& pquery : smp ) {
242  filter_min_accumulated_weight( pquery, threshold );
243  }
244 }
245 
246 void filter_n_max_weight_placements( Pquery& pquery, size_t n )
247 {
248  // Check if there is anything to do at all.
249  if( pquery.placement_size() == 0 || pquery.placement_size() <= n ) {
250  return;
251  }
252 
253  // Sort.
254  sort_placements_by_weight( pquery );
255 
256  // Remove the rest.
257  assert( pquery.placement_size() > n );
258  auto& placements = pquery.expose_placements();
259  placements.erase( placements.begin() + n, placements.end() );
260  assert( pquery.placement_size() == n );
261 
262  // Slow old version.
263  // while( pquery.placement_size() > n ) {
264  // pquery.remove_placement_at( pquery.placement_size() - 1 );
265  // }
266 }
267 
269 {
270  for( auto& pquery : smp ) {
271  filter_n_max_weight_placements( pquery, n );
272  }
273 }
274 
275 void filter_min_weight_threshold( Pquery& pquery, double threshold )
276 {
277  auto& placements = pquery.expose_placements();
278  utils::erase_if( placements, [&]( PqueryPlacement const& placement ){
279  return placement.like_weight_ratio < threshold;
280  });
281 
282  // Slow old version.
283  // This is certainly not the cleanest solution, as it might cause quite some relocations.
284  // However, we'd need full write access to the placement vector, which is currently not
285  // supported for reasons of encapsulation...
286  // size_t i = 0;
287  // while( i < pquery.placement_size() ) {
288  // if( pquery.placement_at(i).like_weight_ratio < threshold ) {
289  // pquery.remove_placement_at(i);
290  // } else {
291  // ++i;
292  // }
293  // }
294 }
295 
296 void filter_min_weight_threshold( Sample& smp, double threshold )
297 {
298  for( auto& pquery : smp ) {
299  filter_min_weight_threshold( pquery, threshold );
300  }
301 }
302 
303 void filter_min_pendant_length( Pquery& pquery, double threshold )
304 {
305  auto& placements = pquery.expose_placements();
306  utils::erase_if( placements, [&]( PqueryPlacement const& placement ){
307  return placement.pendant_length < threshold;
308  });
309 }
310 
311 void filter_min_pendant_length( Sample& sample, double threshold )
312 {
313  for( auto& pquery : sample ) {
314  filter_min_pendant_length( pquery, threshold );
315  }
316 }
317 
318 void filter_max_pendant_length( Pquery& pquery, double threshold )
319 {
320  auto& placements = pquery.expose_placements();
321  utils::erase_if( placements, [&]( PqueryPlacement const& placement ){
322  return placement.pendant_length > threshold;
323  });
324 }
325 
326 void filter_max_pendant_length( Sample& sample, double threshold )
327 {
328  for( auto& pquery : sample ) {
329  filter_max_pendant_length( pquery, threshold );
330  }
331 }
332 
334 {
335  size_t cnt = 0;
336  auto new_end_pqueries = std::remove_if(
337  sample.begin(),
338  sample.end(),
339  [&]( Pquery& pqry ){
340  cnt += static_cast<int>( pqry.placement_size() == 0 );
341  return pqry.placement_size() == 0;
342  }
343  );
344  assert( cnt == static_cast<size_t>( sample.end() - new_end_pqueries ));
345  sample.remove( new_end_pqueries, sample.end() );
346  return cnt;
347 
348  // Slow version.
349  // size_t i = 0;
350  // size_t r = 0;
351  // while( i < sample.size() ) {
352  // if( sample.at(i).placement_size() == 0 ) {
353  // sample.remove( i );
354  // ++r;
355  // } else {
356  // ++i;
357  // }
358  // }
359  // return r;
360 }
361 
362 // =================================================================================================
363 // Filtering Names
364 // =================================================================================================
365 
366 // template<typename F>
367 // void filter_pqueries_by_name( Sample& smp, F predicate )
368 // {
369 // // We do a neat trick here: Iterate all pqueries of the sample via a std::remove if,
370 // // and for each of them, remove all names that we don't want to keep via another std::remove_if.
371 // // Then, if no names remain, and the remove_empty_name_pqueries flag is set, we let the outer
372 // // remove_if also remove the whole pquery.
373 // auto new_end_pqueries = std::remove_if(
374 // smp.begin(),
375 // smp.end(),
376 // [&]( Pquery& pqry ){
377 // // Remove names all where the name does not match.
378 // auto& name_container = pqry.expose_names();
379 // auto new_end_names = std::remove_if(
380 // name_container.begin(),
381 // name_container.end(),
382 // [&]( PqueryName const& nm ){
383 // return ! predicate( nm.name );
384 // }
385 // );
386 // name_container.erase( new_end_names, name_container.end() );
387 //
388 // // If there are no names left, the whole pquery can be removed.
389 // return name_container.empty();
390 // }
391 // );
392 // smp.remove( new_end_pqueries, smp.end() );
393 // }
394 
395 template<typename F>
397  Sample& smp,
398  F predicate,
399  bool remove_empty_pqueries
400 ) {
401  for( auto& pquery : smp ) {
402  // Remove names all where the name does not match.
403  auto& name_container = pquery.expose_names();
404  auto new_end_names = std::remove_if(
405  name_container.begin(),
406  name_container.end(),
407  [&]( PqueryName const& nm ){
408  return ! predicate( nm.name );
409  }
410  );
411  name_container.erase( new_end_names, name_container.end() );
412  }
413  if( remove_empty_pqueries ) {
415  }
416 }
417 
419  Sample& smp,
420  std::string const& regex,
421  bool remove_empty_pqueries
422 ) {
423  std::regex pattern( regex );
424  filter_pqueries_by_name_( smp, [&]( std::string const& name ){
425  return regex_match( name, pattern );
426  }, remove_empty_pqueries );
427 }
428 
430  Sample& smp,
431  std::unordered_set<std::string> const& keep_list,
432  bool remove_empty_pqueries
433 ) {
434  filter_pqueries_by_name_( smp, [&]( std::string const& name ){
435  return keep_list.count( name ) > 0;
436  }, remove_empty_pqueries );
437 }
438 
440  Sample& smp,
441  std::string const& regex,
442  bool remove_empty_pqueries
443 ) {
444  std::regex pattern( regex );
445  filter_pqueries_by_name_( smp, [&]( std::string const& name ){
446  return ! regex_match( name, pattern );
447  }, remove_empty_pqueries );
448 }
449 
451  Sample& smp,
452  std::unordered_set<std::string> const& remove_list,
453  bool remove_empty_pqueries
454 ) {
455  filter_pqueries_by_name_( smp, [&]( std::string const& name ){
456  return remove_list.count( name ) == 0;
457  }, remove_empty_pqueries );
458 }
459 
461  Sample& sample_1,
462  Sample& sample_2,
463  bool remove_empty_pqueries
464 ) {
465  // Remove those pqueries from sample_2 which do not occur in sample_1.
466  auto names = all_pquery_names( sample_1 );
467  filter_pqueries_keeping_names( sample_2, names, remove_empty_pqueries );
468 
469  // And vice versa (using the names of the already smaller sample_2).
470  names = all_pquery_names( sample_2 );
471  filter_pqueries_keeping_names( sample_1, names, remove_empty_pqueries );
472 }
473 
475  Sample& sample_1,
476  Sample& sample_2,
477  bool remove_empty_pqueries
478 ) {
479  // Yep, yep, this is wasteful. But it works for now.
480 
481  // Get all sorted names in sample 1 ...
482  auto names_1_u = all_pquery_names( sample_1 );
483  auto names_1 = std::vector< std::string >( names_1_u.begin(), names_1_u.end() );
484  std::sort( names_1.begin(), names_1.end() );
485  names_1_u.clear();
486 
487  // ... and in sample 2.
488  auto names_2_u = all_pquery_names( sample_2 );
489  auto names_2 = std::vector< std::string >( names_2_u.begin(), names_2_u.end() );
490  std::sort( names_2.begin(), names_2.end() );
491  names_2_u.clear();
492 
493  // Get their intersection.
494  std::unordered_set< std::string > symdiff;
495  std::set_intersection(
496  names_1.begin(), names_1.end(),
497  names_2.begin(), names_2.end(),
498  std::inserter( symdiff, symdiff.end() )
499  );
500 
501  // Remove all intersecting elements from the sampels.
502  filter_pqueries_removing_names( sample_1, symdiff, remove_empty_pqueries );
503  filter_pqueries_removing_names( sample_2, symdiff, remove_empty_pqueries );
504 }
505 
507 {
508  size_t cnt = 0;
509  auto new_end_pqueries = std::remove_if(
510  sample.begin(),
511  sample.end(),
512  [&]( Pquery& pqry ){
513  cnt += static_cast<int>( pqry.name_size() == 0 );
514  return pqry.name_size() == 0;
515  }
516  );
517  assert( cnt == static_cast<size_t>( sample.end() - new_end_pqueries ));
518  sample.remove( new_end_pqueries, sample.end() );
519  return cnt;
520 
521  // Slow version.
522  // size_t i = 0;
523  // size_t r = 0;
524  // while( i < sample.size() ) {
525  // if( sample.at(i).name_size() == 0 ) {
526  // sample.remove( i );
527  // ++r;
528  // } else {
529  // ++i;
530  // }
531  // }
532  // return r;
533 }
534 
535 // =================================================================================================
536 // Joining and Merging
537 // =================================================================================================
538 
539 void copy_pqueries( Sample const& source, Sample& target )
540 {
541  // Check for identical topology, taxa names and edge_nums.
542  // We do not check here for branch_length, because usually those differ slightly.
543  if( ! compatible_trees( source.tree(), target.tree() )) {
544  throw std::runtime_error("Cannot join Samples, because their PlacementTrees differ.");
545  }
546 
547  // We need to assign edge pointers to the correct edge objects, so we need a mapping.
548  auto en_map = edge_num_to_edge_map( target.tree() );
549 
550  // Copy all (o)ther pqueries to (n)ew pqueries.
551  for( const auto& opqry : source.pqueries() ) {
552  auto npqry = Pquery();
553  for( auto opit = opqry.begin_placements(); opit != opqry.end_placements(); ++opit ) {
554 
555  // Assuming that the trees have identical topology (checked at the beginning of this
556  // function), there will be an edge for every placement. if this assertion fails,
557  // something broke the integrity of our in memory representation of the data.
558  assert( en_map.count( opit->edge_num() ) > 0 );
559 
560  npqry.add_placement( *en_map[ opit->edge_num() ], *opit );
561  }
562  for( auto name_it = opqry.begin_names(); name_it != opqry.end_names(); ++name_it ) {
563  npqry.add_name( *name_it );
564  }
565  target.add( npqry );
566  }
567 }
568 
570 {
572  merge_duplicate_names (smp);
574 }
575 
577 {
578  // We are looking for the transitive closure of all Pqueries that pairwise share a common name.
579  // In a graph theory setting, this could be depicted as follows:
580  // Each Pquery is a node, and it has an edge to other nodes iff they share a common name.
581  // For this, there exist algorithms like Floyd–Warshall. However, they are an overkill for the
582  // problem at hand, in terms of the complexity of both code and runtime requirements.
583  // From the biological data and the way Evolutionary Placement works, we do not expect to have
584  // many edges in this graph. Thus, this function uses a simpler solution: repeated search.
585  // Worst case, this needs as many iterations over all Pqueries as the longest chain of shared
586  // names. Pqueries with shared names like (a,b) (b,c) (c,d) (d,e) might need four iterations,
587  // depending on the order of those Pqueries. This is acceptable, as this case should be rare.
588 
589  // Iterate as long as needed to merge all transitive connections between Pqueries that share
590  // names. We need as many iterations as the longest chain of connected Pqueries.
591  bool need_iteration = true;
592  while (need_iteration) {
593  need_iteration = false;
594 
595  // A hash smp that contains the already processed names and links them to their pquery.
596  // Whenever the same name is found again, the associated placements will be copied to the
597  // stored Pquery and the then surplus Pquery will be deleted.
598  std::unordered_map<std::string, Pquery*> hash;
599 
600  // This is a list of the Pquery indices that we want to delete, because their placements
601  // have been moved to other Pqueries (those, which contain the same name). We need this
602  // list as deleting from a list while iterating it is not a good idea.
603  std::vector<size_t> del;
604 
605  for( size_t i = 0; i < smp.size(); ++i ) {
606  auto& pqry = smp.at(i);
607 
608  // Collect the Pqueries that can be merged with the current one, because they share
609  // a common name.
610  std::unordered_set<Pquery*> merges;
611  for( auto name_it = pqry.begin_names(); name_it != pqry.end_names(); ++name_it ) {
612  auto& name = *name_it;
613 
614  if (hash.count(name.name) > 0) {
615  merges.insert(hash[name.name]);
616  }
617  }
618 
619  if (merges.size() == 0) {
620  // All names are new, so store them in the hash smp for later.
621  // We don't need to do any merging in this case.
622  for( auto name_it = pqry.begin_names(); name_it != pqry.end_names(); ++name_it ) {
623  auto& name = *name_it;
624 
625  // We are here because we found no pquery to merge with. This means that the
626  // name cannot be in the hash smp already.
627  assert( hash.count(name.name) == 0 );
628 
629  // Now add it.
630  hash[name.name] = &pqry;
631  }
632  } else {
633  // We need merging. We will merge with only one Pquery in this iteration. If there
634  // are more than one Pqueries that we need to merge with (i.e., merges.size() > 1),
635  // we will mark that we need another iteration and do the other mergings then.
636 
637  // Get the Pquery that we will merge into.
638  Pquery* merge_into = *merges.begin();
639 
640  // TODO we could use move here instead of copy creating.
641 
642  // Add all placements to it.
643  for( auto const& place : pqry.placements() ) {
644  merge_into->add_placement( place );
645  }
646 
647  // Add all names. This will cause doubled names, but they can be reduced later
648  // via merge_duplicate_names().
649  // We could do the check here, but this would increase complexity and gain just a
650  // bit of speed (probably).
651  for( auto const& name : pqry.names() ) {
652  // Add the name to the Pquery.
653  merge_into->add_name( name );
654 
655  // Also add it to the hash smp. If this was name that is responsible for the
656  // merging (the shared name), we will replace the entry by an indentical copy.
657  // For all other names of the current Pquery, we need this to ensure that the
658  // next Pqueries in the loop will find it.
659  hash[name.name] = merge_into;
660  }
661 
662  // Mark the Pquery for deletion and delete its content
663  // (this is both to save memory, but also for some assertions later).
664  del.push_back(i);
665  pqry.clear();
666 
667  // Check whether we need to merge with more than one Pquery, meaning that we have
668  // transitive connections. This means, we need another iteration to resolve this.
669  if (merges.size() > 1) {
670  need_iteration = true;
671  }
672  }
673  }
674 
675  // Delete all Pqueries that were merged to others during this iteration.
676  // We need to do this in reverse order so that the indeces are not messed up while deleting.
677  for( auto it = del.rbegin(); it != del.rend(); ++it ) {
678  // Assert that this is an empty pquery. We cleared the ones that are marked for
679  // deletion, so in case that it is not empty, we are about to delete the wrong one!
680  assert( smp.at( *it ).placement_size() == 0 );
681  assert( smp.at( *it ).name_size() == 0 );
682 
683  smp.remove( *it );
684  }
685  }
686 }
687 
689 {
690  // Merge the placements into this map, indexed by the edge index they belong to.
691  // Also, store how many placements have been merged into this one.
692  std::unordered_map< size_t, std::pair< PqueryPlacement, size_t >> merge_units;
693 
694  for( auto pit = pquery.begin_placements(); pit != pquery.end_placements(); ++pit ) {
695  auto const& place = *pit;
696  auto edge_idx = place.edge().index();
697 
698  // For the first placement on each edge, make a copy.
699  if( merge_units.count( edge_idx ) == 0 ) {
700  merge_units[ edge_idx ] = { place, 0 };
701 
702  // For all others, add their values to the stored one.
703  } else {
704  auto& merge_into = merge_units[ edge_idx ].first;
705  ++merge_units[ edge_idx ].second;
706 
707  merge_into.likelihood += place.likelihood;
708  merge_into.like_weight_ratio += place.like_weight_ratio;
709  merge_into.proximal_length += place.proximal_length;
710  merge_into.pendant_length += place.pendant_length;
711  }
712  }
713 
714  // Clear all previous placements and add back the averaged merged ones.
715  pquery.clear_placements();
716  for( auto& merge_unit : merge_units ) {
717  auto& place = merge_unit.second.first;
718 
719  if( merge_unit.second.second > 1 ) {
720  double denom = static_cast<double>( merge_unit.second.second );
721 
722  place.likelihood /= denom;
723  place.like_weight_ratio /= denom;
724  place.proximal_length /= denom;
725  place.pendant_length /= denom;
726  }
727 
728  pquery.add_placement(place);
729  }
730 }
731 
733 {
734  for( auto pquery_it = smp.begin(); pquery_it != smp.end(); ++pquery_it ) {
735  merge_duplicate_placements( *pquery_it );
736  }
737 }
738 
740 {
741  // We add names to this map, indexed by their actual name string.
742  std::unordered_map<std::string, PqueryName> result;
743  for( auto name_it = pquery.begin_names(); name_it != pquery.end_names(); ++name_it ) {
744  auto const& name = *name_it;
745 
746  if( result.count( name.name ) == 0 ) {
747  result[ name.name ] = name;
748  } else {
749  result[ name.name ].multiplicity += name.multiplicity;
750  }
751  }
752 
753  // Now delete all names and re-populate using the map.
754  pquery.clear_names();
755  for( auto const& n : result ) {
756  pquery.add_name( n.second );
757  }
758 }
759 
761 {
762  for( auto pquery_it = smp.begin(); pquery_it != smp.end(); ++pquery_it ) {
763  merge_duplicate_names( *pquery_it );
764  }
765 }
766 
767 // =================================================================================================
768 // Placement Mass
769 // =================================================================================================
770 
771 size_t total_name_count( Sample const& smp )
772 {
773  size_t count = 0;
774  for( auto const& pqry : smp.pqueries() ) {
775  count += pqry.name_size();
776  }
777  return count;
778 }
779 
780 size_t total_placement_count( Sample const& smp )
781 {
782  size_t count = 0;
783  for( auto const& pqry : smp.pqueries() ) {
784  count += pqry.placement_size();
785  }
786  return count;
787 }
788 
789 std::pair<PlacementTreeEdge const*, size_t> placement_count_max_edge( Sample const& smp )
790 {
791  PlacementTreeEdge const* edge = nullptr;
792  size_t max = 0;
793 
794  auto place_map = placements_per_edge( smp );
795 
796  for( size_t edge_i = 0; edge_i < place_map.size(); ++edge_i ) {
797  if( place_map[ edge_i ].size() > max ) {
798  edge = &smp.tree().edge_at( edge_i );
799  max = place_map[ edge_i ].size();
800  }
801  }
802 
803  return std::make_pair(edge, max);
804 }
805 
806 std::pair<PlacementTreeEdge const*, double> placement_mass_max_edge( Sample const& smp )
807 {
808  PlacementTreeEdge const* edge = nullptr;
809  double max = 0.0;
810 
811  auto place_map = placements_per_edge( smp );
812 
813  for( size_t edge_i = 0; edge_i < place_map.size(); ++edge_i ) {
814  double sum = 0.0;
815  for( auto const& place : place_map[ edge_i ]) {
816  sum += place->like_weight_ratio;
817  }
818  if (sum > max) {
819  edge = &smp.tree().edge_at( edge_i );
820  max = sum;
821  }
822  }
823  return std::make_pair(edge, max);
824 }
825 
826 // =================================================================================================
827 // Histograms
828 // =================================================================================================
829 
830 std::vector<double> closest_leaf_weight_distribution( Sample const& sample )
831 {
832  std::vector<double> distrib;
833 
834  // Get a vector telling us the depth from each node to its closest leaf node.
835  auto depths = tree::closest_leaf_depth_vector( sample.tree() );
836 
837  for( auto const& pquery : sample.pqueries() ) {
838  for( auto& place : pquery.placements() ) {
839  // Try both nodes at the end of the placement's edge and see which one is closer
840  // to a leaf.
841  auto dp = depths[ place.edge().primary_node().index() ].second;
842  auto ds = depths[ place.edge().secondary_node().index() ].second;
843  auto ld = std::min(dp, ds);
844 
845  // Put the closer one into the histogram, resize if necessary.
846  if( distrib.size() < ld + 1 ) {
847  distrib.resize( ld + 1, 0.0 );
848  }
849  distrib[ld] += place.like_weight_ratio;
850  }
851  }
852 
853  return distrib;
854 }
855 
856 // TODO use the histogram class of utils here!
857 
858 std::vector<int> closest_leaf_depth_histogram( Sample const& smp )
859 {
860  std::vector<int> hist;
861 
862  // Get a vector telling us the depth from each node to its closest leaf node.
863  auto depths = tree::closest_leaf_depth_vector( smp.tree() );
864 
865  for( auto const& pquery : smp.pqueries() ) {
866  for( auto& place : pquery.placements() ) {
867  // Try both nodes at the end of the placement's edge and see which one is closer
868  // to a leaf.
869  auto dp = depths[ place.edge().primary_node().index() ].second;
870  auto ds = depths[ place.edge().secondary_node().index() ].second;
871  auto ld = std::min(dp, ds);
872 
873  // Put the closer one into the histogram, resize if necessary.
874  if (ld + 1 > hist.size()) {
875  hist.resize(ld + 1, 0);
876  }
877  ++hist[ld];
878  }
879  }
880 
881  return hist;
882 }
883 
885  Sample const& smp, const double min, const double max, const int bins
886 ) {
887  std::vector<int> hist(bins, 0);
888  double bin_size = (max - min) / bins;
889 
890  // get a vector telling us the distance from each node to its closest leaf node.
891  auto dists = tree::closest_leaf_distance_vector( smp.tree() );
892 
893  for (const auto& pquery : smp.pqueries()) {
894  for( auto& place : pquery.placements() ) {
895  // try both nodes at the end of the placement's edge and see which one is closer
896  // to a leaf.
897  double dp = place.pendant_length
898  + place.proximal_length
899  + dists[ place.edge().primary_node().index() ].second;
900  double ds = place.pendant_length
901  + place.edge().data<PlacementEdgeData>().branch_length
902  - place.proximal_length
903  + dists[ place.edge().secondary_node().index() ].second;
904  double ld = std::min(dp, ds);
905 
906  // find the right bin. if the distance value is outside the boundaries of [min;max],
907  // place it in the first or last bin, respectively.
908  int bin = static_cast <int> (std::floor( (ld - min) / bin_size ));
909  if (bin < 0) {
910  bin = 0;
911  }
912  if (bin >= bins) {
913  bin = bins - 1;
914  }
915  ++hist[bin];
916  }
917  }
918 
919  return hist;
920 }
921 
923  Sample const& smp, double& min, double& max, const int bins
924 ) {
925  std::vector<int> hist(bins, 0);
926 
927  // we do not know yet where the boundaries of the histogram lie, so we need to store all values
928  // first and find their min and max.
929  std::vector<double> distrib;
930  double min_d = 0.0;
931  double max_d = 0.0;
932 
933  // get a vector telling us the distance from each node to its closest leaf node.
934  auto dists = tree::closest_leaf_distance_vector( smp.tree() );
935 
936  // calculate all distances from placements to their closest leaf and store them.
937  for (const auto& pquery : smp.pqueries()) {
938  for( auto& place : pquery.placements() ) {
939  // try both nodes at the end of the placement's edge and see which one is closer
940  // to a leaf.
941  double dp = place.pendant_length
942  + place.proximal_length
943  + dists[ place.edge().primary_node().index() ].second;
944  double ds = place.pendant_length
945  + place.edge().data<PlacementEdgeData>().branch_length
946  - place.proximal_length
947  + dists[ place.edge().secondary_node().index() ].second;
948  double ld = std::min(dp, ds);
949  distrib.push_back(ld);
950 
951  // update min and max as needed (and in first iteration). we use std::nextafter() for
952  // the max in order to make sure that the max value is actually placed in the last bin.
953  if (distrib.size() == 1 || ld < min_d) {
954  min_d = ld;
955  }
956  if (distrib.size() == 1 || ld > max_d) {
957  max_d = std::nextafter(ld, ld + 1);
958  }
959  }
960  }
961 
962  // now we know min and max of the distances, so we can calculate the histogram.
963  double bin_size = (max_d - min_d) / bins;
964  for (double ld : distrib) {
965  int bin = static_cast <int> (std::floor( (ld - min_d) / bin_size ));
966  assert(bin >= 0 && bin < bins);
967  ++hist[bin];
968  }
969 
970  // report the min and max values to the calling function and return the histogram.
971  min = min_d;
972  max = max_d;
973  return hist;
974 }
975 
976 } // namespace placement
977 } // namespace genesis
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::placement::Sample::tree
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:124
genesis::placement::Pquery::clear_placements
void clear_placements()
Delete all PqueryPlacements of this Pquery.
Definition: pquery.cpp:133
genesis::placement::Pquery::name_size
size_t name_size() const
Return the number of PqueryNames stored in this Pquery.
Definition: pquery.cpp:193
genesis::placement::filter_pqueries_keeping_names
void filter_pqueries_keeping_names(Sample &smp, std::string const &regex, bool remove_empty_pqueries)
Remove all PqueryNames which do not match the given regex.
Definition: placement/function/functions.cpp:418
functions.hpp
CommonTree functions.
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::placement::copy_pqueries
void copy_pqueries(Sample const &source, Sample &target)
Copy all Pqueries from the source Sample (left parameter) to the target Sample (right parameter).
Definition: placement/function/functions.cpp:539
genesis::placement::filter_pqueries_removing_names
void filter_pqueries_removing_names(Sample &smp, std::string const &regex, bool remove_empty_pqueries)
Remove all PqueryNames which match the given regex.
Definition: placement/function/functions.cpp:439
genesis::placement::Sample::size
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:138
genesis::placement::Sample::end
iterator_pqueries end()
Return an iterator to the end of the Pqueries of this Sample.
Definition: sample.cpp:254
genesis::placement::placement_count_max_edge
std::pair< PlacementTreeEdge const *, size_t > placement_count_max_edge(Sample const &smp)
Get the number of placements on the edge with the most placements, and a pointer to this edge.
Definition: placement/function/functions.cpp:789
genesis::placement::Pquery::begin_names
iterator_names begin_names()
Definition: pquery.cpp:142
genesis::placement::total_name_count
size_t total_name_count(Sample const &smp)
Get the total number of PqueryNames in all Pqueries of the given Sample.
Definition: placement/function/functions.cpp:771
genesis::placement::Sample
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on.
Definition: sample.hpp:68
genesis::placement::Pquery::name_at
PqueryName & name_at(size_t index)
Return the PqueryName at a certain index.
Definition: pquery.cpp:198
genesis::placement::find_pquery
Pquery const * find_pquery(Sample const &smp, std::string const &name)
Return the first Pquery that has a particular name, or nullptr of none has.
Definition: placement/function/functions.cpp:84
functions.hpp
genesis::tree::Tree::edge_at
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
genesis::placement::Pquery::begin_placements
iterator_placements begin_placements()
Definition: pquery.cpp:53
genesis::placement::filter_min_weight_threshold
void filter_min_weight_threshold(Pquery &pquery, double threshold)
Remove all PqueryPlacements that have a like_weight_ratio below the given threshold.
Definition: placement/function/functions.cpp:275
genesis::placement::Sample::add
Pquery & add()
Create an empty Pquery, add it to the Sample and return it.
Definition: sample.cpp:152
std.hpp
Provides some valuable additions to STD.
genesis::placement::has_name
bool has_name(Pquery const &pquery, std::string const &name)
Return true iff the given Pquery contains a particular name.
Definition: placement/function/functions.cpp:64
functions.hpp
Provides functions for working with Placements and Pqueries.
genesis::placement::PqueryName::name
std::string name
Name for a Pquery.
Definition: name.hpp:118
genesis::placement::placement_mass_max_edge
std::pair< PlacementTreeEdge const *, double > placement_mass_max_edge(Sample const &smp)
Get the summed mass of the placements on the heaviest edge, measured by their like_weight_ratio,...
Definition: placement/function/functions.cpp:806
genesis::placement::filter_max_pendant_length
void filter_max_pendant_length(Pquery &pquery, double threshold)
Remove all PqueryPlacements that have a pendant_length above the given threshold.
Definition: placement/function/functions.cpp:318
genesis::placement::filter_pqueries_differing_names
void filter_pqueries_differing_names(Sample &sample_1, Sample &sample_2, bool remove_empty_pqueries)
Remove all PqueryNames from the two Samples that occur in both of them.
Definition: placement/function/functions.cpp:474
distances.hpp
Header of CommonTree distance methods.
genesis::placement::PlacementEdgeData
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num,...
Definition: placement_tree.hpp:139
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
genesis::placement::Pquery::placement_at
PqueryPlacement & placement_at(size_t index)
Return the PqueryPlacement at a certain index.
Definition: pquery.cpp:118
genesis::placement::edge_num_to_edge_map
std::unordered_map< int, PlacementTreeEdge * > edge_num_to_edge_map(PlacementTree const &tree)
Return a mapping of edge_num integers to the corresponding PlacementTreeEdge object.
Definition: placement/function/helper.cpp:55
genesis::placement::PqueryPlacement::like_weight_ratio
double like_weight_ratio
Likelihood weight ratio of this placement.
Definition: placement/pquery/placement.hpp:137
genesis::placement::Sample::begin
iterator_pqueries begin()
Return an iterator to the beginning of the Pqueries of this Sample.
Definition: sample.cpp:244
genesis::placement::closest_leaf_distance_histogram_auto
std::vector< int > closest_leaf_distance_histogram_auto(Sample const &smp, double &min, double &max, const int bins)
Returns the same type of histogram as closest_leaf_distance_histogram(), but automatically determines...
Definition: placement/function/functions.cpp:922
genesis::placement::sort_placements_by_weight
void sort_placements_by_weight(Pquery &pquery)
Sort the PqueryPlacements of a Pquery by their like_weight_ratio, in descending order (most likely fi...
Definition: placement/function/functions.cpp:147
genesis::placement::Pquery
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
genesis::utils::erase_if
void erase_if(Container &c, UnaryPredicate p)
Erases all elements from the container that satisfy a given predicate. An element is erased,...
Definition: algorithm.hpp:107
genesis::placement::Pquery::add_name
PqueryName & add_name(std::string name="", double multiplicity=1.0)
Create a new PqueryName using the provided parameters, add it to the Pquery and return it.
Definition: pquery.cpp:181
genesis::placement::filter_pqueries_intersecting_names
void filter_pqueries_intersecting_names(Sample &sample_1, Sample &sample_2, bool remove_empty_pqueries)
Remove all PqueryNames from the two Samples that are unique to each of them.
Definition: placement/function/functions.cpp:460
genesis::placement::Pquery::end_names
iterator_names end_names()
Definition: pquery.cpp:147
genesis::placement::scale_all_branch_lengths
void scale_all_branch_lengths(Sample &smp, double factor)
Scale all branch lengths of the Tree and the position of the PqueryPlacements by a given factor.
Definition: placement/function/functions.cpp:168
genesis::placement::Sample::at
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:190
genesis::placement::merge_duplicate_placements
void merge_duplicate_placements(Pquery &pquery)
Merge all PqueryPlacements of a Pquery that are on the same TreeEdge into one averaged PqueryPlacemen...
Definition: placement/function/functions.cpp:688
genesis::placement::all_pquery_names
std::unordered_set< std::string > all_pquery_names(Sample const &sample)
Return a set of all unique PqueryNames of the Pqueries of the given sample.
Definition: placement/function/functions.cpp:108
genesis::tree::TreeEdge
Definition: edge.hpp:60
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
operators.hpp
Tree operator functions.
genesis::placement::Pquery::placements
utils::Range< iterator_placements > placements()
Return a Range iterator to the PqueryPlacements.
Definition: pquery.cpp:73
genesis::placement::Pquery::placement_size
size_t placement_size() const
Return the number of PqueryPlacements stored in this Pquery.
Definition: pquery.cpp:113
genesis::placement::merge_duplicates
void merge_duplicates(Sample &smp)
Look for Pqueries with the same name and merge them.
Definition: placement/function/functions.cpp:569
operators.hpp
genesis::placement::normalize_weight_ratios
void normalize_weight_ratios(Pquery &pquery)
Recalculate the like_weight_ratio of the PqueryPlacement&s of a Pquery, so that their sum is 1....
Definition: placement/function/functions.cpp:123
genesis::placement::Pquery::clear_names
void clear_names()
Delete all PqueryNames of this Pquery.
Definition: pquery.cpp:213
genesis::placement::remove_empty_name_pqueries
size_t remove_empty_name_pqueries(Sample &sample)
Remove all Pqueries from the Sample that have no PqueryNames.
Definition: placement/function/functions.cpp:506
genesis::placement::filter_pqueries_by_name_
void filter_pqueries_by_name_(Sample &smp, F predicate, bool remove_empty_pqueries)
Definition: placement/function/functions.cpp:396
genesis::placement::closest_leaf_depth_histogram
std::vector< int > closest_leaf_depth_histogram(Sample const &smp)
Return a histogram representing how many placements have which depth with respect to their closest le...
Definition: placement/function/functions.cpp:858
genesis::placement::closest_leaf_weight_distribution
std::vector< double > closest_leaf_weight_distribution(Sample const &sample)
Definition: placement/function/functions.cpp:830
genesis::placement::PqueryName
A name of a Pquery and its multiplicity.
Definition: name.hpp:55
genesis::placement::Pquery::end_placements
iterator_placements end_placements()
Definition: pquery.cpp:58
genesis::placement::collect_duplicate_pqueries
void collect_duplicate_pqueries(Sample &smp)
Find all Pqueries that share a common name and combine them into a single Pquery containing all thei...
Definition: placement/function/functions.cpp:576
genesis::tree::CommonEdgeData
Common class containing the commonly needed data for tree edges.
Definition: tree/common_tree/tree.hpp:144
genesis::placement::merge_duplicate_names
void merge_duplicate_names(Pquery &pquery)
Merge all PqueryNames that have the same name property into one, while adding up their multiplicity.
Definition: placement/function/functions.cpp:739
genesis::placement::PqueryPlacement
One placement position of a Pquery on a Tree.
Definition: placement/pquery/placement.hpp:75
genesis::placement::placements_per_edge
std::vector< std::vector< PqueryPlacement const * > > placements_per_edge(Sample const &smp, bool only_max_lwr_placements)
Return a mapping from each PlacementTreeEdges to the PqueryPlacements that are placed on that edge.
Definition: placement/function/helper.cpp:106
genesis::placement::total_placement_count
size_t total_placement_count(Sample const &smp)
Get the total number of PqueryPlacements in all Pqueries of the given Sample.
Definition: placement/function/functions.cpp:780
genesis::placement::Pquery::add_placement
PqueryPlacement & add_placement(PlacementTreeEdge &edge)
Create a new PqueryPlacement at a given PlacementTreeEdge, add it to the Pquery and return it.
Definition: pquery.cpp:92
genesis::placement::Sample::remove
void remove(size_t index)
Remove the Pquery at a given index from the Sample.
Definition: sample.cpp:204
genesis::tree::TreeEdge::data
EdgeDataType & data()
Definition: edge.hpp:217
genesis::placement::Pquery::expose_placements
std::vector< PqueryPlacement > & expose_placements()
Definition: pquery.cpp:83
distances.hpp
Header of Tree distance methods.
genesis::placement::Sample::pqueries
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:264
genesis::placement::filter_min_accumulated_weight
void filter_min_accumulated_weight(Pquery &pquery, double threshold)
Remove the PqueryPlacements with the lowest like_weight_ratio, while keeping the accumulated weight (...
Definition: placement/function/functions.cpp:212
genesis::tree::closest_leaf_distance_vector
std::vector< std::pair< TreeNode const *, double > > closest_leaf_distance_vector(Tree const &tree)
Return a vector containing the closest leaf node for each node, using the branch_length as distance m...
Definition: tree/common_tree/distances.cpp:345
genesis::placement::adjust_branch_lengths
void adjust_branch_lengths(Sample &sample, tree::Tree const &source)
Take the branch lengths of the source Tree and use them as the new branch lengths of the sample.
Definition: placement/function/functions.cpp:178
genesis::placement::closest_leaf_distance_histogram
std::vector< int > closest_leaf_distance_histogram(Sample const &smp, const double min, const double max, const int bins)
Returns a histogram counting the number of placements that have a certain distance to their closest l...
Definition: placement/function/functions.cpp:884
genesis::placement::filter_n_max_weight_placements
void filter_n_max_weight_placements(Pquery &pquery, size_t n)
Remove all PqueryPlacements but the n most likely ones from the Pquery.
Definition: placement/function/functions.cpp:246
genesis::tree::scale_all_branch_lengths
void scale_all_branch_lengths(Tree &tree, double factor)
Scale all branch lengths of a Tree by a given factor.
Definition: tree/common_tree/functions.cpp:233
genesis::placement::compatible_trees
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
Definition: placement/function/operators.cpp:63
genesis::placement::remove_empty_placement_pqueries
size_t remove_empty_placement_pqueries(Sample &sample)
Remove all Pqueries from the Sample that have no PqueryPlacements.
Definition: placement/function/functions.cpp:333
genesis::tree::closest_leaf_depth_vector
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
Definition: tree/function/distances.cpp:251
helper.hpp
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
genesis::placement::filter_min_pendant_length
void filter_min_pendant_length(Pquery &pquery, double threshold)
Remove all PqueryPlacements that have a pendant_length below the given threshold.
Definition: placement/function/functions.cpp:303
genesis::placement::PqueryPlacement::pendant_length
double pendant_length
Length of the attached branch of this placement.
Definition: placement/pquery/placement.hpp:165