A toolkit for working with phylogenetic data.
v0.19.0
placement/function/functions.cpp
/*
    Genesis - A toolkit for working with phylogenetic data.
    Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Contact:
    Lucas Czech <lucas.czech@h-its.org>
    Exelixis Lab, Heidelberg Institute for Theoretical Studies
    Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
*/
// The file documentation comment and the genesis-internal includes
// (placement, tree, and utils headers) are omitted in this listing.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <exception>
#include <regex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#ifdef GENESIS_OPENMP
#    include <omp.h>
#endif
namespace genesis {
namespace placement {

// =================================================================================================
//     Pquery Names
// =================================================================================================
bool has_name( Pquery const& pquery, std::string const& name )
{
    for( size_t i = 0; i < pquery.name_size(); ++i ) {
        if( pquery.name_at(i).name == name ) {
            return true;
        }
    }
    return false;
}

bool has_name( Sample const& smp, std::string const& name )
{
    for( auto const& pqry : smp.pqueries() ) {
        if( has_name( pqry, name ) ) {
            return true;
        }
    }
    return false;
}

Pquery const* find_pquery( Sample const& smp, std::string const& name )
{
    // TODO instead of pointer, return an iterator!
    // then use find_if directly!
    for( auto const& pqry : smp.pqueries() ) {
        if( has_name( pqry, name )) {
            return &pqry;
        }
    }
    return nullptr;
}

Pquery* find_pquery( Sample& smp, std::string const& name )
{
    // TODO instead of pointer, return an iterator!
    // then use find_if directly!
    for( auto& pqry : smp.pqueries() ) {
        if( has_name( pqry, name )) {
            return &pqry;
        }
    }
    return nullptr;
}

std::unordered_set<std::string> all_pquery_names( Sample const& sample )
{
    std::unordered_set<std::string> result;
    for( auto const& pquery : sample ) {
        for( auto const& pname : pquery.names() ) {
            result.insert( pname.name );
        }
    }
    return result;
}
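// Usage sketch for the name lookup functions above. The sample and the query
// name are assumptions for illustration only; they are not part of this file.
//
//     Sample sample = ...;                                  // e.g., read from a jplace file
//     if( has_name( sample, "query_042" )) {
//         Pquery const* pqry = find_pquery( sample, "query_042" );
//         assert( pqry != nullptr );
//     }
//     auto const names = all_pquery_names( sample );        // unique names in the sample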
// =================================================================================================
//     Normalization and Sorting
// =================================================================================================

void normalize_weight_ratios( Pquery& pquery )
{
    double sum = 0.0;
    for( auto const& place : pquery.placements() ) {
        sum += place.like_weight_ratio;
    }
    if( sum == 0.0 ) {
        throw std::overflow_error( "Cannot normalize weight ratios if all of them are zero." );
    }
    for( auto& place : pquery.placements() ) {
        place.like_weight_ratio /= sum;
    }
}

void normalize_weight_ratios( Sample& smp )
{
    for( auto pqry_it = smp.begin(); pqry_it != smp.end(); ++pqry_it ) {
        normalize_weight_ratios( *pqry_it );
    }
}
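// Numeric sketch of the normalization above (values chosen for illustration):
//
//     Pquery pqry = ...;                 // placements with like_weight_ratios { 0.2, 0.2, 0.1 }
//     normalize_weight_ratios( pqry );   // ratios become { 0.4, 0.4, 0.2 }, summing to 1.0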
// void sort_placements_by_proximal_length( PlacementTreeEdge& edge );
// void sort_placements_by_proximal_length( Sample& smp );

void sort_placements_by_weight( Pquery& pquery )
{
    std::sort(
        pquery.begin_placements(),
        pquery.end_placements(),
        [] (
            PqueryPlacement const& lhs,
            PqueryPlacement const& rhs
        ) {
            return lhs.like_weight_ratio > rhs.like_weight_ratio;
        }
    );
}

void sort_placements_by_weight( Sample& smp )
{
    for( auto pqry_it = smp.begin(); pqry_it != smp.end(); ++pqry_it ) {
        sort_placements_by_weight( *pqry_it );
    }
}
void scale_all_branch_lengths( Sample& smp, double factor )
{
    tree::scale_all_branch_lengths( smp.tree(), factor );
    for( auto& pqry : smp.pqueries() ) {
        for( auto& place : pqry.placements() ) {
            place.proximal_length *= factor;
        }
    }
}
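// Illustration of the scaling above (hypothetical values): with factor 2.0, an
// edge of branch_length 0.05 becomes 0.10, and a placement at proximal_length
// 0.03 on that edge moves to 0.06, so its relative position on the edge is
// preserved.
//
//     scale_all_branch_lengths( sample, 2.0 );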
void adjust_branch_lengths( Sample& sample, tree::Tree const& source )
{
    // Basic error check.
    if( sample.tree().edge_count() != source.edge_count() ) {
        throw std::runtime_error( "Sizes of the Sample::tree() and source tree differ." );
    }

    // First, adjust the placement positions,
    // because we need the old branch lengths of the sample for correct scaling.
    #pragma omp parallel for
    for( size_t i = 0; i < sample.size(); ++i ) {
        auto& pquery = sample.at(i);

        for( auto& placement : pquery.placements() ) {
            auto const edge_idx = placement.edge().index();
            auto const old_bl = placement.edge().data<PlacementEdgeData>().branch_length;
            auto const new_bl = source.edge_at(edge_idx).data<tree::DefaultEdgeData>().branch_length;

            placement.proximal_length *= new_bl / old_bl;
        }
    }

    // Now adjust the reference tree branch lengths.
    #pragma omp parallel for
    for( size_t edge_idx = 0; edge_idx < source.edge_count(); ++edge_idx ) {
        auto const new_bl = source.edge_at(edge_idx).data<tree::DefaultEdgeData>().branch_length;
        sample.tree().edge_at(edge_idx).data<PlacementEdgeData>().branch_length = new_bl;
    }
}
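// Worked example for the rescaling above (numbers are hypothetical): if a
// placement sits at proximal_length 0.02 on an edge whose old branch_length is
// 0.04, and the source tree has branch_length 0.06 for that edge, the placement
// is moved to 0.02 * 0.06 / 0.04 = 0.03, i.e., it stays at the same relative
// position (here, halfway) along the edge.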
// =================================================================================================
//     Filtering
// =================================================================================================

void filter_min_accumulated_weight( Pquery& pquery, double threshold )
{
    // Sort, so that the most likely placements are in the beginning.
    sort_placements_by_weight( pquery );

    // Find the position where enough weight is accumulated.
    size_t i = 0;
    double weight_sum = 0.0;
    do {
        weight_sum += pquery.placement_at(i).like_weight_ratio;
        ++i;
    } while( weight_sum < threshold && i < pquery.placement_size() );
    assert( i > 0 );

    // Remove the rest.
    if( i < pquery.placement_size() ) {
        auto& placements = pquery.expose_placements();
        placements.erase( placements.begin() + i, placements.end() );
    }
    assert( pquery.placement_size() == i );

    // Slow old version.
    // while( pquery.placement_size() > i ) {
    //     pquery.remove_placement_at( pquery.placement_size() - 1 );
    // }
}
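// Worked example for filter_min_accumulated_weight() (values are hypothetical):
// with like_weight_ratios { 0.7, 0.2, 0.06, 0.04 } and threshold 0.95, the
// accumulated sums are 0.7, 0.9, 0.96, so the first three placements are kept
// and the last one is removed. Note that the function indexes placement 0
// unconditionally, so it expects the pquery to have at least one placement.
//
//     filter_min_accumulated_weight( pqry, 0.95 );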
void filter_min_accumulated_weight( Sample& smp, double threshold )
{
    for( auto& pquery : smp ) {
        filter_min_accumulated_weight( pquery, threshold );
    }
}
void filter_n_max_weight_placements( Pquery& pquery, size_t n )
{
    // Check if there is anything to do at all.
    if( pquery.placement_size() == 0 || pquery.placement_size() <= n ) {
        return;
    }

    // Sort.
    sort_placements_by_weight( pquery );

    // Remove the rest.
    assert( pquery.placement_size() > n );
    auto& placements = pquery.expose_placements();
    placements.erase( placements.begin() + n, placements.end() );
    assert( pquery.placement_size() == n );

    // Slow old version.
    // while( pquery.placement_size() > n ) {
    //     pquery.remove_placement_at( pquery.placement_size() - 1 );
    // }
}

void filter_n_max_weight_placements( Sample& smp, size_t n )
{
    for( auto& pquery : smp ) {
        filter_n_max_weight_placements( pquery, n );
    }
}
void filter_min_weight_threshold( Pquery& pquery, double threshold )
{
    auto& placements = pquery.expose_placements();
    utils::erase_if( placements, [&]( PqueryPlacement const& placement ){
        return placement.like_weight_ratio < threshold;
    });

    // Slow old version.
    // This is certainly not the cleanest solution, as it might cause quite some relocations.
    // However, we'd need full write access to the placement vector, which is currently not
    // supported for reasons of encapsulation...
    // size_t i = 0;
    // while( i < pquery.placement_size() ) {
    //     if( pquery.placement_at(i).like_weight_ratio < threshold ) {
    //         pquery.remove_placement_at(i);
    //     } else {
    //         ++i;
    //     }
    // }
}

void filter_min_weight_threshold( Sample& smp, double threshold )
{
    for( auto& pquery : smp ) {
        filter_min_weight_threshold( pquery, threshold );
    }
}
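// Typical combination of the placement filters above (the thresholds are
// illustrative, not recommendations): drop clearly negligible placements,
// keep only enough placements to cover most of the weight, and cap the number
// of placements per pquery.
//
//     filter_min_weight_threshold( sample, 0.01 );
//     filter_min_accumulated_weight( sample, 0.99 );
//     filter_n_max_weight_placements( sample, 10 );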
void filter_pqueries_keeping_names( Sample& smp, std::string const& regex )
{
    std::regex pattern( regex );
    auto new_past_the_end = std::remove_if(
        smp.begin(),
        smp.end(),
        [&] ( Pquery const& pqry ) {
            for( auto const& nm : pqry.names() ) {
                if( regex_match( nm.name, pattern ) ) {
                    return false;
                }
            }
            return true;
        }
    );
    smp.remove( new_past_the_end, smp.end() );
}

void filter_pqueries_keeping_names( Sample& smp, std::unordered_set<std::string> keep_list )
{
    auto new_past_the_end = std::remove_if(
        smp.begin(),
        smp.end(),
        [&] ( Pquery const& pqry ) {
            for( auto const& nm : pqry.names() ) {
                if( keep_list.count( nm.name ) > 0 ) {
                    return false;
                }
            }
            return true;
        }
    );
    smp.remove( new_past_the_end, smp.end() );
}

void filter_pqueries_removing_names( Sample& smp, std::string const& regex )
{
    std::regex pattern( regex );
    auto new_past_the_end = std::remove_if(
        smp.begin(),
        smp.end(),
        [&] ( Pquery const& pqry ) {
            for( auto const& nm : pqry.names() ) {
                if( regex_match( nm.name, pattern ) ) {
                    return true;
                }
            }
            return false;
        }
    );
    smp.remove( new_past_the_end, smp.end() );
}

void filter_pqueries_removing_names( Sample& smp, std::unordered_set<std::string> remove_list )
{
    auto new_past_the_end = std::remove_if(
        smp.begin(),
        smp.end(),
        [&] ( Pquery const& pqry ) {
            for( auto const& nm : pqry.names() ) {
                if( remove_list.count( nm.name ) > 0 ) {
                    return true;
                }
            }
            return false;
        }
    );
    smp.remove( new_past_the_end, smp.end() );
}
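// Usage sketch for the name filters above (the regex and the name list are
// hypothetical): keep only pqueries whose names match a pattern, or remove
// pqueries by an explicit name list.
//
//     filter_pqueries_keeping_names( sample, "sample_A_.*" );
//
//     std::unordered_set<std::string> bad_reads = { "read_1", "read_2" };
//     filter_pqueries_removing_names( sample, bad_reads );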
void filter_pqueries_intersecting_names( Sample& sample_1, Sample& sample_2 )
{
    // Remove those pqueries from sample_2 which do not occur in sample_1.
    auto names = all_pquery_names( sample_1 );
    filter_pqueries_keeping_names( sample_2, names );

    // And vice versa (using the names of the already smaller sample_2).
    names = all_pquery_names( sample_2 );
    filter_pqueries_keeping_names( sample_1, names );
}
void filter_pqueries_differing_names( Sample& sample_1, Sample& sample_2 )
{
    // Yep, yep, this is wasteful. But it works for now.

    // Get all sorted names in sample 1 ...
    auto names_1_u = all_pquery_names( sample_1 );
    auto names_1 = std::vector< std::string >( names_1_u.begin(), names_1_u.end() );
    std::sort( names_1.begin(), names_1.end() );
    names_1_u.clear();

    // ... and in sample 2.
    auto names_2_u = all_pquery_names( sample_2 );
    auto names_2 = std::vector< std::string >( names_2_u.begin(), names_2_u.end() );
    std::sort( names_2.begin(), names_2.end() );
    names_2_u.clear();

    // Get their intersection.
    std::unordered_set< std::string > intersection;
    std::set_intersection(
        names_1.begin(), names_1.end(),
        names_2.begin(), names_2.end(),
        std::inserter( intersection, intersection.end() )
    );

    // Remove all intersecting names from the samples.
    filter_pqueries_removing_names( sample_1, intersection );
    filter_pqueries_removing_names( sample_2, intersection );
}
size_t remove_empty_pqueries( Sample& sample )
{
    size_t i = 0;
    size_t r = 0;
    while( i < sample.size() ) {
        if( sample.at(i).placement_size() == 0 ) {
            sample.remove( i );
            ++r;
        } else {
            ++i;
        }
    }
    return r;
}
// =================================================================================================
//     Joining and Merging
// =================================================================================================

void copy_pqueries( Sample const& source, Sample& target )
{
    // Check for identical topology, taxa names and edge_nums.
    // We do not check here for branch_length, because usually those differ slightly.
    if( ! compatible_trees( source.tree(), target.tree() )) {
        throw std::runtime_error("Cannot join Samples, because their PlacementTrees differ.");
    }

    // We need to assign edge pointers to the correct edge objects, so we need a mapping.
    auto en_map = edge_num_to_edge_map( target.tree() );

    // Copy all (o)ther pqueries to (n)ew pqueries.
    for( const auto& opqry : source.pqueries() ) {
        auto npqry = Pquery();
        for( auto opit = opqry.begin_placements(); opit != opqry.end_placements(); ++opit ) {

            // Assuming that the trees have identical topology (checked at the beginning of this
            // function), there will be an edge for every placement. If this assertion fails,
            // something broke the integrity of our in-memory representation of the data.
            assert( en_map.count( opit->edge_num() ) > 0 );

            npqry.add_placement( *en_map[ opit->edge_num() ], *opit );
        }
        for( auto name_it = opqry.begin_names(); name_it != opqry.end_names(); ++name_it ) {
            npqry.add_name( *name_it );
        }
        target.add( npqry );
    }
}
void merge_duplicates( Sample& smp )
{
    collect_duplicate_pqueries( smp );
    merge_duplicate_names( smp );
    merge_duplicate_placements( smp );
}
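// Usage sketch for joining samples (the sample variables are assumptions for
// illustration): copy all pqueries of one sample into another with a compatible
// reference tree, then collapse pqueries that ended up sharing names.
//
//     copy_pqueries( sample_2, sample_1 );
//     merge_duplicates( sample_1 );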
void collect_duplicate_pqueries( Sample& smp )
{
    // We are looking for the transitive closure of all Pqueries that pairwise share a common name.
    // In a graph theory setting, this could be depicted as follows:
    // Each Pquery is a node, and it has an edge to other nodes iff they share a common name.
    // For this, there exist algorithms like Floyd–Warshall. However, they are overkill for the
    // problem at hand, in terms of the complexity of both code and runtime requirements.
    // From the biological data and the way Evolutionary Placement works, we do not expect to have
    // many edges in this graph. Thus, this function uses a simpler solution: repeated search.
    // Worst case, this needs as many iterations over all Pqueries as the longest chain of shared
    // names. Pqueries with shared names like (a,b) (b,c) (c,d) (d,e) might need four iterations,
    // depending on the order of those Pqueries. This is acceptable, as this case should be rare.

    // Iterate as long as needed to merge all transitive connections between Pqueries that share
    // names. We need as many iterations as the longest chain of connected Pqueries.
    bool need_iteration = true;
    while( need_iteration ) {
        need_iteration = false;

        // A hash map that contains the already processed names and links them to their pquery.
        // Whenever the same name is found again, the associated placements will be copied to the
        // stored Pquery and the then surplus Pquery will be deleted.
        std::unordered_map<std::string, Pquery*> hash;

        // This is a list of the Pquery indices that we want to delete, because their placements
        // have been moved to other Pqueries (those, which contain the same name). We need this
        // list as deleting from a list while iterating it is not a good idea.
        std::vector<size_t> del;

        for( size_t i = 0; i < smp.size(); ++i ) {
            auto& pqry = smp.at(i);

            // Collect the Pqueries that can be merged with the current one, because they share
            // a common name.
            std::unordered_set<Pquery*> merges;
            for( auto name_it = pqry.begin_names(); name_it != pqry.end_names(); ++name_it ) {
                auto& name = *name_it;

                if( hash.count( name.name ) > 0 ) {
                    merges.insert( hash[ name.name ] );
                }
            }

            if( merges.size() == 0 ) {
                // All names are new, so store them in the hash map for later.
                // We don't need to do any merging in this case.
                for( auto name_it = pqry.begin_names(); name_it != pqry.end_names(); ++name_it ) {
                    auto& name = *name_it;

                    // We are here because we found no pquery to merge with. This means that the
                    // name cannot be in the hash map already.
                    assert( hash.count( name.name ) == 0 );

                    // Now add it.
                    hash[ name.name ] = &pqry;
                }
            } else {
                // We need merging. We will merge with only one Pquery in this iteration. If there
                // are more than one Pqueries that we need to merge with (i.e., merges.size() > 1),
                // we will mark that we need another iteration and do the other mergings then.

                // Get the Pquery that we will merge into.
                Pquery* merge_into = *merges.begin();

                // TODO we could use move here instead of copy creating.

                // Add all placements to it.
                for( auto const& place : pqry.placements() ) {
                    merge_into->add_placement( place );
                }

                // Add all names. This will cause doubled names, but they can be reduced later
                // via merge_duplicate_names().
                // We could do the check here, but this would increase complexity and gain just a
                // bit of speed (probably).
                for( auto const& name : pqry.names() ) {
                    // Add the name to the Pquery.
                    merge_into->add_name( name );

                    // Also add it to the hash map. If this was the name that is responsible for
                    // the merging (the shared name), we will replace the entry by an identical
                    // copy. For all other names of the current Pquery, we need this to ensure
                    // that the next Pqueries in the loop will find it.
                    hash[ name.name ] = merge_into;
                }

                // Mark the Pquery for deletion and delete its content
                // (this both saves memory and enables the assertions below).
                del.push_back(i);
                pqry.clear();

                // Check whether we need to merge with more than one Pquery, meaning that we have
                // transitive connections. This means, we need another iteration to resolve this.
                if( merges.size() > 1 ) {
                    need_iteration = true;
                }
            }
        }

        // Delete all Pqueries that were merged into others during this iteration.
        // We need to do this in reverse order so that the indices are not messed up while deleting.
        for( auto it = del.rbegin(); it != del.rend(); ++it ) {
            // Assert that this is an empty pquery. We cleared the ones that are marked for
            // deletion, so in case that it is not empty, we are about to delete the wrong one!
            assert( smp.at( *it ).placement_size() == 0 );
            assert( smp.at( *it ).name_size() == 0 );

            smp.remove( *it );
        }
    }
}
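// Worked example for the transitive merging above (names are hypothetical):
// given pqueries with name sets (a,b), (b,c) and (c,d), the first pass merges
// (b,c) into (a,b) via the shared name b, and then (c,d) into the same merged
// pquery via the shared name c. A further pass is only triggered when a pquery
// shares names with more than one already-seen pquery, e.g., (a), (b), (a,b).
// The resulting pquery carries duplicate names and possibly duplicate
// placements, which merge_duplicate_names() and merge_duplicate_placements()
// clean up afterwards.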
void merge_duplicate_placements( Pquery& pquery )
{
    // Merge the placements into this map, indexed by the edge index they belong to.
    // Also, store how many placements contribute to this merged one.
    std::unordered_map< size_t, std::pair< PqueryPlacement, size_t >> merge_units;

    for( auto pit = pquery.begin_placements(); pit != pquery.end_placements(); ++pit ) {
        auto const& place = *pit;
        auto edge_idx = place.edge().index();

        // For the first placement on each edge, make a copy.
        if( merge_units.count( edge_idx ) == 0 ) {
            merge_units[ edge_idx ] = { place, 1 };

        // For all others, add their values to the stored one.
        } else {
            auto& merge_into = merge_units[ edge_idx ].first;
            ++merge_units[ edge_idx ].second;

            merge_into.likelihood        += place.likelihood;
            merge_into.like_weight_ratio += place.like_weight_ratio;
            merge_into.proximal_length   += place.proximal_length;
            merge_into.pendant_length    += place.pendant_length;
            merge_into.parsimony         += place.parsimony;
        }
    }

    // Clear all previous placements and add back the averaged merged ones.
    pquery.clear_placements();
    for( auto& merge_unit : merge_units ) {
        auto& place = merge_unit.second.first;

        if( merge_unit.second.second > 1 ) {
            double denom = static_cast<double>( merge_unit.second.second );

            place.likelihood        /= denom;
            place.like_weight_ratio /= denom;
            place.proximal_length   /= denom;
            place.pendant_length    /= denom;
            place.parsimony         /= denom;
        }

        pquery.add_placement(place);
    }
}
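// Numeric sketch of the merging above (values are hypothetical): two placements
// of the same pquery on the same edge with proximal_length 0.02 and 0.04 and
// like_weight_ratio 0.3 and 0.1 are replaced by a single placement with
// proximal_length 0.03 and like_weight_ratio 0.2, i.e., the per-field average
// of the merged placements.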
void merge_duplicate_placements( Sample& smp )
{
    for( auto pquery_it = smp.begin(); pquery_it != smp.end(); ++pquery_it ) {
        merge_duplicate_placements( *pquery_it );
    }
}
void merge_duplicate_names( Pquery& pquery )
{
    // We add names to this map, indexed by their actual name string.
    std::unordered_map<std::string, PqueryName> result;
    for( auto name_it = pquery.begin_names(); name_it != pquery.end_names(); ++name_it ) {
        auto const& name = *name_it;

        if( result.count( name.name ) == 0 ) {
            result[ name.name ] = name;
        } else {
            result[ name.name ].multiplicity += name.multiplicity;
        }
    }

    // Now delete all names and re-populate using the map.
    pquery.clear_names();
    for( auto const& n : result ) {
        pquery.add_name( n.second );
    }
}
void merge_duplicate_names( Sample& smp )
{
    for( auto pquery_it = smp.begin(); pquery_it != smp.end(); ++pquery_it ) {
        merge_duplicate_names( *pquery_it );
    }
}
// =================================================================================================
//     Placement Mass
// =================================================================================================

double total_multiplicity( Pquery const& pqry )
{
    double mult = 0.0;
    for( auto const& name : pqry.names() ) {
        mult += name.multiplicity;
    }
    return mult;
}
double total_multiplicity( Sample const& sample )
{
    double mult = 0.0;
    for( auto const& pqry : sample.pqueries() ) {
        mult += total_multiplicity( pqry );
    }
    return mult;
}

size_t total_name_count( Sample const& smp )
{
    size_t count = 0;
    for( auto const& pqry : smp.pqueries() ) {
        count += pqry.name_size();
    }
    return count;
}

size_t total_placement_count( Sample const& smp )
{
    size_t count = 0;
    for( auto const& pqry : smp.pqueries() ) {
        count += pqry.placement_size();
    }
    return count;
}

double total_placement_mass( Sample const& smp )
{
    double sum = 0.0;
    for( const auto& pqry : smp.pqueries() ) {
        for( auto& place : pqry.placements() ) {
            sum += place.like_weight_ratio;
        }
    }
    return sum;
}
double total_placement_mass_with_multiplicities( Sample const& smp )
{
    double sum = 0.0;
    for( const auto& pqry : smp.pqueries() ) {
        double mult = 0.0;
        for( auto const& name : pqry.names() ) {
            mult += name.multiplicity;
        }

        double lwr_sum = 0.0;
        for( auto const& place : pqry.placements() ) {
            lwr_sum += place.like_weight_ratio;
        }
        sum += lwr_sum * mult;
    }
    return sum;
}
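// Numeric sketch for the mass functions above (values are hypothetical): a
// pquery with like_weight_ratios { 0.6, 0.4 } and a single name of
// multiplicity 3.0 contributes 1.0 to total_placement_mass() and
// 1.0 * 3.0 = 3.0 to total_placement_mass_with_multiplicities().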
std::pair<PlacementTreeEdge const*, size_t> placement_count_max_edge( Sample const& smp )
{
    PlacementTreeEdge const* edge = nullptr;
    size_t max = 0;

    auto place_map = placements_per_edge( smp );

    for( size_t edge_i = 0; edge_i < place_map.size(); ++edge_i ) {
        if( place_map[ edge_i ].size() > max ) {
            edge = &smp.tree().edge_at( edge_i );
            max = place_map[ edge_i ].size();
        }
    }

    return std::make_pair( edge, max );
}

std::pair<PlacementTreeEdge const*, double> placement_mass_max_edge( Sample const& smp )
{
    PlacementTreeEdge const* edge = nullptr;
    double max = 0.0;

    auto place_map = placements_per_edge( smp );

    for( size_t edge_i = 0; edge_i < place_map.size(); ++edge_i ) {
        double sum = 0.0;
        for( auto const& place : place_map[ edge_i ]) {
            sum += place->like_weight_ratio;
        }
        if( sum > max ) {
            edge = &smp.tree().edge_at( edge_i );
            max = sum;
        }
    }
    return std::make_pair( edge, max );
}
// =================================================================================================
//     Histograms
// =================================================================================================

std::vector<double> closest_leaf_weight_distribution( Sample const& sample )
{
    std::vector<double> distrib;

    // Get a vector telling us the depth from each node to its closest leaf node.
    auto depths = tree::closest_leaf_depth_vector( sample.tree() );

    for( auto const& pquery : sample.pqueries() ) {
        for( auto& place : pquery.placements() ) {
            // Try both nodes at the end of the placement's edge and see which one is closer
            // to a leaf.
            int dp = depths[ place.edge().primary_node().index() ].second;
            int ds = depths[ place.edge().secondary_node().index() ].second;
            unsigned int ld = std::min(dp, ds);

            // Put the closer one into the histogram, resize if necessary.
            if( distrib.size() < ld + 1 ) {
                distrib.resize( ld + 1, 0.0 );
            }
            distrib[ld] += place.like_weight_ratio;
        }
    }

    return distrib;
}

// TODO use the histogram class of utils here!
std::vector<int> closest_leaf_depth_histogram( Sample const& smp )
{
    std::vector<int> hist;

    // Get a vector telling us the depth from each node to its closest leaf node.
    auto depths = tree::closest_leaf_depth_vector( smp.tree() );

    for( auto const& pquery : smp.pqueries() ) {
        for( auto& place : pquery.placements() ) {
            // Try both nodes at the end of the placement's edge and see which one is closer
            // to a leaf.
            int dp = depths[ place.edge().primary_node().index() ].second;
            int ds = depths[ place.edge().secondary_node().index() ].second;
            unsigned int ld = std::min(dp, ds);

            // Put the closer one into the histogram, resize if necessary.
            if( ld + 1 > hist.size() ) {
                hist.resize( ld + 1, 0 );
            }
            ++hist[ld];
        }
    }

    return hist;
}
std::vector<int> closest_leaf_distance_histogram(
    Sample const& smp, const double min, const double max, const int bins
) {
    std::vector<int> hist( bins, 0 );
    double bin_size = ( max - min ) / bins;

    // Get a vector telling us the distance from each node to its closest leaf node.
    auto dists = tree::closest_leaf_distance_vector( smp.tree() );

    for( const auto& pquery : smp.pqueries() ) {
        for( auto& place : pquery.placements() ) {
            // Try both nodes at the end of the placement's edge and see which one is closer
            // to a leaf.
            double dp = place.pendant_length
                      + place.proximal_length
                      + dists[ place.edge().primary_node().index() ].second;
            double ds = place.pendant_length
                      + place.edge().data<PlacementEdgeData>().branch_length
                      - place.proximal_length
                      + dists[ place.edge().secondary_node().index() ].second;
            double ld = std::min( dp, ds );

            // Find the right bin. If the distance value is outside the boundaries of [min; max],
            // place it in the first or last bin, respectively.
            int bin = static_cast<int>( std::floor( ( ld - min ) / bin_size ));
            if( bin < 0 ) {
                bin = 0;
            }
            if( bin >= bins ) {
                bin = bins - 1;
            }
            ++hist[bin];
        }
    }

    return hist;
}
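// Binning sketch for the histogram above (numbers are hypothetical): with
// min = 0.0, max = 1.0 and bins = 4, the bin size is 0.25, so a placement with
// closest-leaf distance 0.3 falls into bin 1, and any distance at or beyond
// 1.0 is clamped into the last bin (index 3).
//
//     auto hist = closest_leaf_distance_histogram( sample, 0.0, 1.0, 4 );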
std::vector<int> closest_leaf_distance_histogram_auto(
    Sample const& smp, double& min, double& max, const int bins
) {
    std::vector<int> hist( bins, 0 );

    // We do not know yet where the boundaries of the histogram lie, so we need to store all values
    // first and find their min and max.
    std::vector<double> distrib;
    double min_d = 0.0;
    double max_d = 0.0;

    // Get a vector telling us the distance from each node to its closest leaf node.
    auto dists = tree::closest_leaf_distance_vector( smp.tree() );

    // Calculate all distances from placements to their closest leaf and store them.
    for( const auto& pquery : smp.pqueries() ) {
        for( auto& place : pquery.placements() ) {
            // Try both nodes at the end of the placement's edge and see which one is closer
            // to a leaf.
            double dp = place.pendant_length
                      + place.proximal_length
                      + dists[ place.edge().primary_node().index() ].second;
            double ds = place.pendant_length
                      + place.edge().data<PlacementEdgeData>().branch_length
                      - place.proximal_length
                      + dists[ place.edge().secondary_node().index() ].second;
            double ld = std::min( dp, ds );
            distrib.push_back( ld );

            // Update min and max as needed (and in the first iteration). We use std::nextafter()
            // for the max in order to make sure that the max value is actually placed in the
            // last bin.
            if( distrib.size() == 1 || ld < min_d ) {
                min_d = ld;
            }
            if( distrib.size() == 1 || ld > max_d ) {
                max_d = std::nextafter( ld, ld + 1 );
            }
        }
    }

    // Now we know the min and max of the distances, so we can calculate the histogram.
    double bin_size = ( max_d - min_d ) / bins;
    for( double ld : distrib ) {
        int bin = static_cast<int>( std::floor( ( ld - min_d ) / bin_size ));
        assert( bin >= 0 && bin < bins );
        ++hist[bin];
    }

    // Report the min and max values to the calling function and return the histogram.
    min = min_d;
    max = max_d;
    return hist;
}
} // namespace placement
} // namespace genesis
std::vector< std::vector< PqueryPlacement const * > > placements_per_edge(Sample const &smp, bool only_max_lwr_placements)
Return a mapping from each PlacementTreeEdges to the PqueryPlacements that are placed on that edge...
PqueryName & name_at(size_t index)
Return the PqueryName at a certain index.
Definition: pquery.cpp:198
Provides some valuable algorithms that are not part of the C++ 11 STL.
std::vector< double > closest_leaf_weight_distribution(Sample const &sample)
size_t total_name_count(Sample const &smp)
Get the total number of PqueryNames in all Pqueries of the given Sample.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
void erase_if(Container &c, UnaryPredicate p)
Erases all elements from the container that satisfy a given predicate. An element is erased...
Definition: algorithm.hpp:70
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
void sort_placements_by_weight(Pquery &pquery)
Sort the PqueryPlacements of a Pquery by their like_weight_ratio, in descending order (most likely fi...
std::unordered_set< std::string > all_pquery_names(Sample const &sample)
Return a set of all unique PqueryNames of the Pqueries of the given sample.
void filter_n_max_weight_placements(Pquery &pquery, size_t n)
Remove all PqueryPlacements but the n most likely ones from the Pquery.
bool has_name(Pquery const &pquery, std::string const &name)
Return true iff the given Pquery contains a particular name.
size_t name_size() const
Return the number of PqueryNames stored in this Pquery.
Definition: pquery.cpp:193
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
void scale_all_branch_lengths(Sample &smp, double factor)
Scale all branch lengths of the Tree and the position of the PqueryPlacements by a given factor...
void filter_pqueries_differing_names(Sample &sample_1, Sample &sample_2)
Remove all Pqueries from the two Samples that have a name in common.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
iterator_names end_names()
Definition: pquery.cpp:147
Tree operator functions.
std::string name
Name for a Pquery.
Definition: name.hpp:118
PqueryName & add_name(std::string name="", double multiplicity=1.0)
Create a new PqueryName using the provided parameters, add it to the Pquery and return it...
Definition: pquery.cpp:181
void scale_all_branch_lengths(Tree &tree, double factor)
Scale all branch lengths of a Tree by a given factor.
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:185
Provides functions for working with Placements and Pqueries.
void normalize_weight_ratios(Pquery &pquery)
Recalculate the like_weight_ratio of the PqueryPlacements of a Pquery, so that their sum is 1...
One placement position of a Pquery on a Tree.
double sum(const Histogram &h)
PqueryPlacement & placement_at(size_t index)
Return the PqueryPlacement at a certain index.
Definition: pquery.cpp:118
iterator_placements end_placements()
Definition: pquery.cpp:58
void clear_names()
Delete all PqueryNames of this Pquery.
Definition: pquery.cpp:213
std::vector< PqueryPlacement > & expose_placements()
Definition: pquery.cpp:83
std::pair< PlacementTreeEdge const *, size_t > placement_count_max_edge(Sample const &smp)
Get the number of placements on the edge with the most placements, and a pointer to this edge...
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
Provides some valuable additions to STD.
void filter_pqueries_keeping_names(Sample &smp, std::string const &regex)
Remove all Pqueries which do not have at least one name that matches the given regex.
void collect_duplicate_pqueries(Sample &smp)
Find all Pqueries that share a common name and combine them into a single Pquery containing all thei...
utils::Range< iterator_placements > placements()
Return a Range iterator to the PqueryPlacements.
Definition: pquery.cpp:73
Default class containing the commonly needed data for tree edges.
std::vector< int > closest_leaf_distance_histogram(Sample const &smp, const double min, const double max, const int bins)
Returns a histogram counting the number of placements that have a certain distance to their closest l...
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
double total_multiplicity(Pquery const &pqry)
Return the sum of all multiplicities of the Pquery.
std::vector< int > closest_leaf_distance_histogram_auto(Sample const &smp, double &min, double &max, const int bins)
Returns the same type of histogram as closest_leaf_distance_histogram(), but automatically determines...
size_t placement_size() const
Return the number of PqueryPlacements stored in this Pquery.
Definition: pquery.cpp:113
std::pair< PlacementTreeEdge const *, double > placement_mass_max_edge(Sample const &smp)
Get the summed mass of the placements on the heaviest edge, measured by their like_weight_ratio, and a pointer to this edge.
void copy_pqueries(Sample const &source, Sample &target)
Copy all Pqueries from the source Sample (left parameter) to the target Sample (right parameter)...
void merge_duplicate_names(Pquery &pquery)
Merge all PqueryNames that have the same name property into one, while adding up their multiplicity...
iterator_names begin_names()
Definition: pquery.cpp:142
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
void filter_min_accumulated_weight(Pquery &pquery, double threshold)
Remove the PqueryPlacements with the lowest like_weight_ratio, while keeping the accumulated weight (...
void filter_min_weight_threshold(Pquery &pquery, double threshold)
Remove all PqueryPlacements that have a like_weight_ratio below the given threshold.
PqueryPlacement & add_placement(PlacementTreeEdge &edge)
Create a new PqueryPlacement at a given PlacementTreeEdge, add it to the Pquery and return it...
Definition: pquery.cpp:92
std::vector< int > closest_leaf_depth_histogram(Sample const &smp)
Return a histogram representing how many placements have which depth with respect to their closest le...
double total_placement_mass_with_multiplicities(Sample const &smp)
Get the mass of all PqueryPlacements of the Sample, using the multiplicities as factors.
iterator_placements begin_placements()
Definition: pquery.cpp:53
Header of Default Tree distance methods.
void adjust_branch_lengths(Sample &sample, tree::Tree const &source)
Take the branch lengths of the source Tree and use them as the new branch lengths of the sample...
iterator_pqueries begin()
Return an iterator to the beginning of the Pqueries of this Sample.
Definition: sample.cpp:239
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
Default Tree functions.
double total_placement_mass(Sample const &smp)
Get the summed mass of all PqueryPlacements in all Pqueries of the given Sample, where mass is measu...
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
void merge_duplicate_placements(Pquery &pquery)
Merge all PqueryPlacements of a Pquery that are on the same TreeEdge into one averaged PqueryPlacemen...
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
void filter_pqueries_intersecting_names(Sample &sample_1, Sample &sample_2)
Remove all Pqueries from the two Samples except the ones that have names in common.
std::vector< std::pair< TreeNode const *, double > > closest_leaf_distance_vector(Tree const &tree)
Return a vector containing the closest leaf node for each node, using the branch_length as distance m...
void clear_placements()
Delete all PqueryPlacements of this Pquery.
Definition: pquery.cpp:133
utils::Range< iterator_names > names()
Return a Range iterator to the PqueryNames.
Definition: pquery.cpp:162
Header of Tree distance methods.
std::unordered_map< int, PlacementTreeEdge * > edge_num_to_edge_map(PlacementTree const &tree)
Return a mapping of edge_num integers to the corresponding PlacementTreeEdge object.
size_t remove_empty_pqueries(Sample &sample)
Remove all Pqueries from the Sample that have no PqueryPlacements.
void remove(size_t index)
Remove the Pquery at a given index from the Sample.
Definition: sample.cpp:199
double like_weight_ratio
Likelihood weight ratio of this placement.
EdgeDataType & data()
Definition: edge.hpp:118
Pquery const * find_pquery(Sample const &smp, std::string const &name)
Return the first Pquery that has a particular name, or nullptr of none has.
size_t total_placement_count(Sample const &smp)
Get the total number of PqueryPlacements in all Pqueries of the given Sample.
void merge_duplicates(Sample &smp)
Look for Pqueries with the same name and merge them.
iterator_pqueries end()
Return an iterator to the end of the Pqueries of this Sample.
Definition: sample.cpp:249
Pquery & add()
Create an empty Pquery, add it to the Sample and return it.
Definition: sample.cpp:147
void filter_pqueries_removing_names(Sample &smp, std::string const &regex)
Remove all Pqueries which have at least one name that matches the given regex.