A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
placement/function/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cmath>
45 #include <exception>
46 #include <regex>
47 #include <string>
48 #include <unordered_map>
49 #include <unordered_set>
50 #include <vector>
51 
52 #ifdef GENESIS_OPENMP
53 # include <omp.h>
54 #endif
55 
56 namespace genesis {
57 namespace placement {
58 
59 // =================================================================================================
60 // Pquery Names
61 // =================================================================================================
62 
63 bool has_name( Pquery const& pquery, std::string const& name )
64 {
65  for( size_t i = 0; i < pquery.name_size(); ++i ) {
66  if( pquery.name_at(i).name == name ) {
67  return true;
68  }
69  }
70  return false;
71 }
72 
73 bool has_name( Sample const& smp, std::string const& name )
74 {
75  for( auto const& pqry : smp.pqueries() ) {
76  if( has_name( pqry, name ) ) {
77  return true;
78  }
79  }
80  return false;
81 }
82 
83 Pquery const* find_pquery( Sample const& smp, std::string const& name )
84 {
85  // TODO instead of pointer, return an iterator!
86  // then use find if directly!
87  for( auto const& pqry : smp.pqueries() ) {
88  if( has_name( pqry, name )) {
89  return &pqry;
90  }
91  }
92  return nullptr;
93 }
94 
95 Pquery* find_pquery( Sample& smp, std::string const& name )
96 {
97  // TODO instead of pointer, return an iterator!
98  // then use find if directly!
99  for( auto& pqry : smp.pqueries() ) {
100  if( has_name( pqry, name )) {
101  return &pqry;
102  }
103  }
104  return nullptr;
105 }
106 
107 std::unordered_set<std::string> all_pquery_names( Sample const& sample )
108 {
109  std::unordered_set<std::string> result;
110  for( auto const& pquery : sample ) {
111  for( auto const& pname : pquery.names() ) {
112  result.insert( pname.name );
113  }
114  }
115  return result;
116 }
117 
118 // =================================================================================================
119 // Normalization and Sorting
120 // =================================================================================================
121 
123 {
124  double sum = 0.0;
125  for( auto const& place : pquery.placements() ) {
126  sum += place.like_weight_ratio;
127  }
128  if( sum == 0.0 ) {
129  throw std::overflow_error( "Cannot normalize weight ratios if all of them are zero." );
130  }
131  for( auto& place : pquery.placements() ) {
132  place.like_weight_ratio /= sum;
133  }
134 }
135 
137 {
138  for( auto pqry_it = smp.begin(); pqry_it != smp.end(); ++pqry_it ) {
139  normalize_weight_ratios( *pqry_it );
140  }
141 }
142 
143 // void sort_placements_by_proximal_length( PlacementTreeEdge& edge );
144 // void sort_placements_by_proximal_length( Sample& smp );
145 
147 {
148  std::sort(
149  pquery.begin_placements(),
150  pquery.end_placements(),
151  [] (
152  PqueryPlacement const& lhs,
153  PqueryPlacement const& rhs
154  ) {
155  return lhs.like_weight_ratio > rhs.like_weight_ratio;
156  }
157  );
158 }
159 
161 {
162  for( auto pqry_it = smp.begin(); pqry_it != smp.end(); ++pqry_it ) {
163  sort_placements_by_weight( *pqry_it );
164  }
165 }
166 
167 void scale_all_branch_lengths( Sample& smp, double factor )
168 {
169  tree::scale_all_branch_lengths( smp.tree(), factor );
170  for( auto& pqry : smp.pqueries() ) {
171  for( auto& place : pqry.placements() ) {
172  place.proximal_length *= factor;
173  }
174  }
175 }
176 
177 void adjust_branch_lengths( Sample& sample, tree::Tree const& source )
178 {
179  // Basic error check.
180  if( sample.tree().edge_count() != source.edge_count() ) {
181  throw std::runtime_error( "Sizes of the Sample::tree() and source tree differ." );
182  }
183 
184  // First, adjust the placement positions,
185  // because we need the old branch lengths of the sample for correct scaling.
186  #pragma omp parallel for
187  for( size_t i = 0; i < sample.size(); ++i ) {
188  auto& pquery = sample.at(i);
189 
190  for( auto& placement : pquery.placements() ) {
191  auto const edge_idx = placement.edge().index();
192  auto const old_bl = placement.edge().data<PlacementEdgeData>().branch_length;
193  auto const new_bl = source.edge_at(edge_idx).data<tree::DefaultEdgeData>().branch_length;
194 
195  placement.proximal_length *= new_bl / old_bl;
196  }
197  }
198 
199  // Now adjust the reference tree branch lengths.
200  #pragma omp parallel for
201  for( size_t edge_idx = 0; edge_idx < source.edge_count(); ++edge_idx ) {
202  auto const new_bl = source.edge_at(edge_idx).data<tree::DefaultEdgeData>().branch_length;
203  sample.tree().edge_at(edge_idx).data<PlacementEdgeData>().branch_length = new_bl;
204  }
205 }
206 
207 // =================================================================================================
208 // Filtering
209 // =================================================================================================
210 
211 void filter_min_accumulated_weight( Pquery& pquery, double threshold )
212 {
213  // Sort, so that the most likely placements are in the beginning.
214  sort_placements_by_weight( pquery );
215 
216  // Find the position where enough weight is accumulated.
217  size_t i = 0 ;
218  double weight_sum = 0.0;
219  do {
220  weight_sum += pquery.placement_at(i).like_weight_ratio;
221  ++i;
222  } while( weight_sum < threshold && i < pquery.placement_size() );
223 
224  // Remove the rest.
225  while( pquery.placement_size() > i ) {
226  pquery.remove_placement_at( pquery.placement_size() - 1 );
227  }
228 }
229 
230 void filter_min_accumulated_weight( Sample& smp, double threshold )
231 {
232  for( auto& pquery : smp ) {
233  filter_min_accumulated_weight( pquery, threshold );
234  }
235 }
236 
237 void filter_n_max_weight_placements( Pquery& pquery, size_t n )
238 {
239  // Check if there is anything to do at all.
240  if( pquery.placement_size() == 0 || pquery.placement_size() <= n ) {
241  return;
242  }
243 
244  // Sort and remove elements from the back until n are left
245  // (this is not the cleanest solution, but works for now).
246  sort_placements_by_weight( pquery );
247  while( pquery.placement_size() > n ) {
248  pquery.remove_placement_at( pquery.placement_size() - 1 );
249  }
250 }
251 
253 {
254  for( auto& pquery : smp ) {
255  filter_n_max_weight_placements( pquery, n );
256  }
257 }
258 
259 void filter_min_weight_threshold( Pquery& pquery, double threshold )
260 {
261  // This is certainly not the cleanest solution, as it might cause quite some relocations.
262  // However, we'd need full write access to the placement vector, which is currently not
263  // supported for reasons of encapsulation...
264  size_t i = 0;
265  while( i < pquery.placement_size() ) {
266  if( pquery.placement_at(i).like_weight_ratio < threshold ) {
267  pquery.remove_placement_at(i);
268  } else {
269  ++i;
270  }
271  }
272 }
273 
274 void filter_min_weight_threshold( Sample& smp, double threshold )
275 {
276  for( auto& pquery : smp ) {
277  filter_min_weight_threshold( pquery, threshold );
278  }
279 }
280 
281 void filter_pqueries_keeping_names( Sample& smp, std::string const& regex )
282 {
283  std::regex pattern( regex );
284  auto new_past_the_end = std::remove_if(
285  smp.begin(),
286  smp.end(),
287  [&] ( Pquery const& pqry ) {
288  for( auto const& nm : pqry.names() ) {
289  if( regex_match( nm.name, pattern ) ) {
290  return false;
291  }
292  }
293  return true;
294  }
295  );
296  smp.remove( new_past_the_end, smp.end() );
297 }
298 
299 void filter_pqueries_keeping_names( Sample& smp, std::unordered_set<std::string> keep_list )
300 {
301  auto new_past_the_end = std::remove_if(
302  smp.begin(),
303  smp.end(),
304  [&] ( Pquery const& pqry ) {
305  for( auto const& nm : pqry.names() ) {
306  if( keep_list.count( nm.name ) > 0 ) {
307  return false;
308  }
309  }
310  return true;
311  }
312  );
313  smp.remove( new_past_the_end, smp.end() );
314 }
315 
316 void filter_pqueries_removing_names( Sample& smp, std::string const& regex )
317 {
318  std::regex pattern( regex );
319  auto new_past_the_end = std::remove_if(
320  smp.begin(),
321  smp.end(),
322  [&] ( Pquery const& pqry ) {
323  for( auto const& nm : pqry.names() ) {
324  if( regex_match( nm.name, pattern ) ) {
325  return true;
326  }
327  }
328  return false;
329  }
330  );
331  smp.remove( new_past_the_end, smp.end() );
332 }
333 
334 void filter_pqueries_removing_names( Sample& smp, std::unordered_set<std::string> remove_list )
335 {
336  auto new_past_the_end = std::remove_if(
337  smp.begin(),
338  smp.end(),
339  [&] ( Pquery const& pqry ) {
340  for( auto const& nm : pqry.names() ) {
341  if( remove_list.count( nm.name ) > 0 ) {
342  return true;
343  }
344  }
345  return false;
346  }
347  );
348  smp.remove( new_past_the_end, smp.end() );
349 }
350 
352 {
353  // Remove those pqueries from sample_2 which do not occur in sample_1.
354  auto names = all_pquery_names( sample_1 );
355  filter_pqueries_keeping_names( sample_2, names );
356 
357  // And vice versa (using the names of the already smaller sample_2).
358  names = all_pquery_names( sample_2 );
359  filter_pqueries_keeping_names( sample_1, names );
360 }
361 
362 void filter_pqueries_differing_names( Sample& sample_1, Sample& sample_2 )
363 {
364  // Yep, yep, this is wasteful. But it works for now.
365 
366  // Get all sorted names in sample 1 ...
367  auto names_1_u = all_pquery_names( sample_1 );
368  auto names_1 = std::vector< std::string >( names_1_u.begin(), names_1_u.end() );
369  std::sort( names_1.begin(), names_1.end() );
370  names_1_u.clear();
371 
372  // ... and in sample 2.
373  auto names_2_u = all_pquery_names( sample_2 );
374  auto names_2 = std::vector< std::string >( names_2_u.begin(), names_2_u.end() );
375  std::sort( names_2.begin(), names_2.end() );
376  names_2_u.clear();
377 
378  // Get their intersection.
379  std::unordered_set< std::string > symdiff;
380  std::set_intersection(
381  names_1.begin(), names_1.end(),
382  names_2.begin(), names_2.end(),
383  std::inserter( symdiff, symdiff.end() )
384  );
385 
386  // Remove all intersecting elements from the sampels.
387  filter_pqueries_removing_names( sample_1, symdiff );
388  filter_pqueries_removing_names( sample_2, symdiff );
389 }
390 
391 size_t remove_empty_pqueries( Sample& sample )
392 {
393  size_t i = 0;
394  size_t r = 0;
395  while( i < sample.size() ) {
396  if( sample.at(i).placement_size() == 0 ) {
397  sample.remove( i );
398  ++r;
399  } else {
400  ++i;
401  }
402  }
403  return r;
404 }
405 
406 // =================================================================================================
407 // Joining and Merging
408 // =================================================================================================
409 
410 void copy_pqueries( Sample const& source, Sample& target )
411 {
412  // Check for identical topology, taxa names and edge_nums.
413  // We do not check here for branch_length, because usually those differ slightly.
414  if( ! compatible_trees( source.tree(), target.tree() )) {
415  throw std::runtime_error("Cannot join Samples, because their PlacementTrees differ.");
416  }
417 
418  // We need to assign edge pointers to the correct edge objects, so we need a mapping.
419  auto en_map = edge_num_to_edge_map( target.tree() );
420 
421  // Copy all (o)ther pqueries to (n)ew pqueries.
422  for( const auto& opqry : source.pqueries() ) {
423  auto npqry = Pquery();
424  for( auto opit = opqry.begin_placements(); opit != opqry.end_placements(); ++opit ) {
425 
426  // Assuming that the trees have identical topology (checked at the beginning of this
427  // function), there will be an edge for every placement. if this assertion fails,
428  // something broke the integrity of our in memory representation of the data.
429  assert( en_map.count( opit->edge_num() ) > 0 );
430 
431  npqry.add_placement( *en_map[ opit->edge_num() ], *opit );
432  }
433  for( auto name_it = opqry.begin_names(); name_it != opqry.end_names(); ++name_it ) {
434  npqry.add_name( *name_it );
435  }
436  target.add( npqry );
437  }
438 }
439 
441 {
443  merge_duplicate_names (smp);
445 }
446 
448 {
449  // We are looking for the transitive closure of all Pqueries that pairwise share a common name.
450  // In a graph theory setting, this could be depicted as follows:
451  // Each Pquery is a node, and it has an edge to other nodes iff they share a common name.
452  // For this, there exist algorithms like Floyd–Warshall. However, they are an overkill for the
453  // problem at hand, in terms of the complexity of both code and runtime requirements.
454  // From the biological data and the way Evolutionary Placement works, we do not expect to have
455  // many edges in this graph. Thus, this function uses a simpler solution: repeated search.
456  // Worst case, this needs as many iterations over all Pqueries as the longest chain of shared
457  // names. Pqueries with shared names like (a,b) (b,c) (c,d) (d,e) might need four iterations,
458  // depending on the order of those Pqueries. This is acceptable, as this case should be rare.
459 
460  // Iterate as long as needed to merge all transitive connections between Pqueries that share
461  // names. We need as many iterations as the longest chain of connected Pqueries.
462  bool need_iteration = true;
463  while (need_iteration) {
464  need_iteration = false;
465 
466  // A hash smp that contains the already processed names and links them to their pquery.
467  // Whenever the same name is found again, the associated placements will be copied to the
468  // stored Pquery and the then surplus Pquery will be deleted.
469  std::unordered_map<std::string, Pquery*> hash;
470 
471  // This is a list of the Pquery indices that we want to delete, because their placements
472  // have been moved to other Pqueries (those, which contain the same name). We need this
473  // list as deleting from a list while iterating it is not a good idea.
474  std::vector<size_t> del;
475 
476  for( size_t i = 0; i < smp.size(); ++i ) {
477  auto& pqry = smp.at(i);
478 
479  // Collect the Pqueries that can be merged with the current one, because they share
480  // a common name.
481  std::unordered_set<Pquery*> merges;
482  for( auto name_it = pqry.begin_names(); name_it != pqry.end_names(); ++name_it ) {
483  auto& name = *name_it;
484 
485  if (hash.count(name.name) > 0) {
486  merges.insert(hash[name.name]);
487  }
488  }
489 
490  if (merges.size() == 0) {
491  // All names are new, so store them in the hash smp for later.
492  // We don't need to do any merging in this case.
493  for( auto name_it = pqry.begin_names(); name_it != pqry.end_names(); ++name_it ) {
494  auto& name = *name_it;
495 
496  // We are here because we found no pquery to merge with. This means that the
497  // name cannot be in the hash smp already.
498  assert(hash.count(name.name) == 0);
499 
500  // Now add it.
501  hash[name.name] = &pqry;
502  }
503  } else {
504  // We need merging. We will merge with only one Pquery in this iteration. If there
505  // are more than one Pqueries that we need to merge with (i.e., merges.size() > 1),
506  // we will mark that we need another iteration and do the other mergings then.
507 
508  // Get the Pquery that we will merge into.
509  Pquery* merge_into = *merges.begin();
510 
511  // TODO we could use move here instead of copy creating.
512 
513  // Add all placements to it.
514  for( auto const& place : pqry.placements() ) {
515  merge_into->add_placement( place );
516  }
517 
518  // Add all names. This will cause doubled names, but they can be reduced later
519  // via merge_duplicate_names().
520  // We could do the check here, but this would increase complexity and gain just a
521  // bit of speed (probably).
522  for( auto const& name : pqry.names() ) {
523  // Add the name to the Pquery.
524  merge_into->add_name( name );
525 
526  // Also add it to the hash smp. If this was name that is responsible for the
527  // merging (the shared name), we will replace the entry by an indentical copy.
528  // For all other names of the current Pquery, we need this to ensure that the
529  // next Pqueries in the loop will find it.
530  hash[name.name] = merge_into;
531  }
532 
533  // Mark the Pquery for deletion and delete its content
534  // (this is both to save memory, but also for some assertions later).
535  del.push_back(i);
536  pqry.clear();
537 
538  // Check whether we need to merge with more than one Pquery, meaning that we have
539  // transitive connections. This means, we need another iteration to resolve this.
540  if (merges.size() > 1) {
541  need_iteration = true;
542  }
543  }
544  }
545 
546  // Delete all Pqueries that were merged to others during this iteration.
547  // We need to do this in reverse order so that the indeces are not messed up while deleting.
548  for( auto it = del.rbegin(); it != del.rend(); ++it ) {
549  // Assert that this is an empty pquery. We cleared the ones that are marked for
550  // deletion, so in case that it is not empty, we are about to delete the wrong one!
551  assert( smp.at( *it ).placement_size() == 0 );
552  assert( smp.at( *it ).name_size() == 0 );
553 
554  smp.remove( *it );
555  }
556  }
557 }
558 
560 {
561  // Merge the placements into this map, indexed by the edge index they belong to.
562  // Also, store how many placements have been merged into this one.
563  std::unordered_map< size_t, std::pair< PqueryPlacement, size_t >> merge_units;
564 
565  for( auto pit = pquery.begin_placements(); pit != pquery.end_placements(); ++pit ) {
566  auto const& place = *pit;
567  auto edge_idx = place.edge().index();
568 
569  // For the first placement on each edge, make a copy.
570  if( merge_units.count( edge_idx ) == 0 ) {
571  merge_units[ edge_idx ] = { place, 0 };
572 
573  // For all others, add their values to the stored one.
574  } else {
575  auto& merge_into = merge_units[ edge_idx ].first;
576  ++merge_units[ edge_idx ].second;
577 
578  merge_into.likelihood += place.likelihood;
579  merge_into.like_weight_ratio += place.like_weight_ratio;
580  merge_into.proximal_length += place.proximal_length;
581  merge_into.pendant_length += place.pendant_length;
582  merge_into.parsimony += place.parsimony;
583  }
584  }
585 
586  // Clear all previous placements and add back the averaged merged ones.
587  pquery.clear_placements();
588  for( auto& merge_unit : merge_units ) {
589  auto& place = merge_unit.second.first;
590 
591  if( merge_unit.second.second > 1 ) {
592  double denom = static_cast<double>( merge_unit.second.second );
593 
594  place.likelihood /= denom;
595  place.like_weight_ratio /= denom;
596  place.proximal_length /= denom;
597  place.pendant_length /= denom;
598  place.parsimony /= denom;
599  }
600 
601  pquery.add_placement(place);
602  }
603 }
604 
606 {
607  for( auto pquery_it = smp.begin(); pquery_it != smp.end(); ++pquery_it ) {
608  merge_duplicate_placements( *pquery_it );
609  }
610 }
611 
613 {
614  // We add names to this map, indexed by their actual name string.
615  std::unordered_map<std::string, PqueryName> result;
616  for( auto name_it = pquery.begin_names(); name_it != pquery.end_names(); ++name_it ) {
617  auto const& name = *name_it;
618 
619  if( result.count( name.name ) == 0 ) {
620  result[ name.name ] = name;
621  } else {
622  result[ name.name ].multiplicity += name.multiplicity;
623  }
624  }
625 
626  // Now delete all names and re-populate using the map.
627  pquery.clear_names();
628  for( auto const& n : result ) {
629  pquery.add_name( n.second );
630  }
631 }
632 
634 {
635  for( auto pquery_it = smp.begin(); pquery_it != smp.end(); ++pquery_it ) {
636  merge_duplicate_names( *pquery_it );
637  }
638 }
639 
640 // =================================================================================================
641 // Placement Mass
642 // =================================================================================================
643 
644 double total_multiplicity( Pquery const& pqry )
645 {
646  double mult = 0.0;
647  for( auto const& name : pqry.names() ) {
648  mult += name.multiplicity;
649  }
650  return mult;
651 }
652 
653 double total_multiplicity( Sample const& sample )
654 {
655  double mult = 0.0;
656  for( auto const& pqry : sample.pqueries() ) {
657  mult += total_multiplicity( pqry );
658  }
659  return mult;
660 }
661 
662 size_t total_placement_count( Sample const& smp )
663 {
664  size_t count = 0;
665  for( auto const& pqry : smp.pqueries() ) {
666  count += pqry.placement_size();
667  }
668  return count;
669 }
670 
671 double total_placement_mass( Sample const& smp )
672 {
673  double sum = 0.0;
674  for( const auto& pqry : smp.pqueries() ) {
675  for( auto& place : pqry.placements() ) {
676  sum += place.like_weight_ratio;
677  }
678  }
679  return sum;
680 }
681 
683 {
684  double sum = 0.0;
685  for( const auto& pqry : smp.pqueries() ) {
686  double mult = 0.0;
687  for( auto const& name : pqry.names() ) {
688  mult += name.multiplicity;
689  }
690 
691  double lwr_sum = 0.0;
692  for( auto const& place : pqry.placements() ) {
693  lwr_sum += place.like_weight_ratio;
694  }
695  sum += lwr_sum * mult;
696  }
697  return sum;
698 }
699 
700 std::pair<PlacementTreeEdge const*, size_t> placement_count_max_edge( Sample const& smp )
701 {
702  PlacementTreeEdge const* edge = nullptr;
703  size_t max = 0;
704 
705  auto place_map = placements_per_edge( smp );
706 
707  for( size_t edge_i = 0; edge_i < place_map.size(); ++edge_i ) {
708  if( place_map[ edge_i ].size() > max ) {
709  edge = &smp.tree().edge_at( edge_i );
710  max = place_map[ edge_i ].size();
711  }
712  }
713 
714  return std::make_pair(edge, max);
715 }
716 
717 std::pair<PlacementTreeEdge const*, double> placement_mass_max_edge( Sample const& smp )
718 {
719  PlacementTreeEdge const* edge = nullptr;
720  double max = 0.0;
721 
722  auto place_map = placements_per_edge( smp );
723 
724  for( size_t edge_i = 0; edge_i < place_map.size(); ++edge_i ) {
725  double sum = 0.0;
726  for( auto const& place : place_map[ edge_i ]) {
727  sum += place->like_weight_ratio;
728  }
729  if (sum > max) {
730  edge = &smp.tree().edge_at( edge_i );
731  max = sum;
732  }
733  }
734  return std::make_pair(edge, max);
735 }
736 
737 // =================================================================================================
738 // Histograms
739 // =================================================================================================
740 
741 std::vector<double> closest_leaf_weight_distribution( Sample const& sample )
742 {
743  std::vector<double> distrib;
744 
745  // Get a vector telling us the depth from each node to its closest leaf node.
746  auto depths = tree::closest_leaf_depth_vector( sample.tree() );
747 
748  for( auto const& pquery : sample.pqueries() ) {
749  for( auto& place : pquery.placements() ) {
750  // Try both nodes at the end of the placement's edge and see which one is closer
751  // to a leaf.
752  int dp = depths[ place.edge().primary_node().index() ].second;
753  int ds = depths[ place.edge().secondary_node().index() ].second;
754  unsigned int ld = std::min(dp, ds);
755 
756  // Put the closer one into the histogram, resize if necessary.
757  if( distrib.size() < ld + 1 ) {
758  distrib.resize( ld + 1, 0.0 );
759  }
760  distrib[ld] += place.like_weight_ratio;
761  }
762  }
763 
764  return distrib;
765 }
766 
767 // TODO use the histogram class of utils here!
768 
769 std::vector<int> closest_leaf_depth_histogram( Sample const& smp )
770 {
771  std::vector<int> hist;
772 
773  // Get a vector telling us the depth from each node to its closest leaf node.
774  auto depths = tree::closest_leaf_depth_vector( smp.tree() );
775 
776  for( auto const& pquery : smp.pqueries() ) {
777  for( auto& place : pquery.placements() ) {
778  // Try both nodes at the end of the placement's edge and see which one is closer
779  // to a leaf.
780  int dp = depths[ place.edge().primary_node().index() ].second;
781  int ds = depths[ place.edge().secondary_node().index() ].second;
782  unsigned int ld = std::min(dp, ds);
783 
784  // Put the closer one into the histogram, resize if necessary.
785  if (ld + 1 > hist.size()) {
786  hist.resize(ld + 1, 0);
787  }
788  ++hist[ld];
789  }
790  }
791 
792  return hist;
793 }
794 
796  Sample const& smp, const double min, const double max, const int bins
797 ) {
798  std::vector<int> hist(bins, 0);
799  double bin_size = (max - min) / bins;
800 
801  // get a vector telling us the distance from each node to its closest leaf node.
802  auto dists = tree::closest_leaf_distance_vector( smp.tree() );
803 
804  for (const auto& pquery : smp.pqueries()) {
805  for( auto& place : pquery.placements() ) {
806  // try both nodes at the end of the placement's edge and see which one is closer
807  // to a leaf.
808  double dp = place.pendant_length
809  + place.proximal_length
810  + dists[ place.edge().primary_node().index() ].second;
811  double ds = place.pendant_length
812  + place.edge().data<PlacementEdgeData>().branch_length
813  - place.proximal_length
814  + dists[ place.edge().secondary_node().index() ].second;
815  double ld = std::min(dp, ds);
816 
817  // find the right bin. if the distance value is outside the boundaries of [min;max],
818  // place it in the first or last bin, respectively.
819  int bin = static_cast <int> (std::floor( (ld - min) / bin_size ));
820  if (bin < 0) {
821  bin = 0;
822  }
823  if (bin >= bins) {
824  bin = bins - 1;
825  }
826  ++hist[bin];
827  }
828  }
829 
830  return hist;
831 }
832 
834  Sample const& smp, double& min, double& max, const int bins
835 ) {
836  std::vector<int> hist(bins, 0);
837 
838  // we do not know yet where the boundaries of the histogram lie, so we need to store all values
839  // first and find their min and max.
840  std::vector<double> distrib;
841  double min_d = 0.0;
842  double max_d = 0.0;
843 
844  // get a vector telling us the distance from each node to its closest leaf node.
845  auto dists = tree::closest_leaf_distance_vector( smp.tree() );
846 
847  // calculate all distances from placements to their closest leaf and store them.
848  for (const auto& pquery : smp.pqueries()) {
849  for( auto& place : pquery.placements() ) {
850  // try both nodes at the end of the placement's edge and see which one is closer
851  // to a leaf.
852  double dp = place.pendant_length
853  + place.proximal_length
854  + dists[ place.edge().primary_node().index() ].second;
855  double ds = place.pendant_length
856  + place.edge().data<PlacementEdgeData>().branch_length
857  - place.proximal_length
858  + dists[ place.edge().secondary_node().index() ].second;
859  double ld = std::min(dp, ds);
860  distrib.push_back(ld);
861 
862  // update min and max as needed (and in first iteration). we use std::nextafter() for
863  // the max in order to make sure that the max value is actually placed in the last bin.
864  if (distrib.size() == 1 || ld < min_d) {
865  min_d = ld;
866  }
867  if (distrib.size() == 1 || ld > max_d) {
868  max_d = std::nextafter(ld, ld + 1);
869  }
870  }
871  }
872 
873  // now we know min and max of the distances, so we can calculate the histogram.
874  double bin_size = (max_d - min_d) / bins;
875  for (double ld : distrib) {
876  int bin = static_cast <int> (std::floor( (ld - min_d) / bin_size ));
877  assert(bin >= 0 && bin < bins);
878  ++hist[bin];
879  }
880 
881  // report the min and max values to the calling function and return the histogram.
882  min = min_d;
883  max = max_d;
884  return hist;
885 }
886 
887 } // namespace placement
888 } // namespace genesis
std::vector< std::vector< PqueryPlacement const * > > placements_per_edge(Sample const &smp, bool only_max_lwr_placements)
Return a mapping from each PlacementTreeEdges to the PqueryPlacements that are placed on that edge...
PqueryName & name_at(size_t index)
Return the PqueryName at a certain index.
Definition: pquery.cpp:283
std::vector< double > closest_leaf_weight_distribution(Sample const &sample)
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
void sort_placements_by_weight(Pquery &pquery)
Sort the PqueryPlacements of a Pquery by their like_weight_ratio, in descending order (most likely fi...
std::unordered_set< std::string > all_pquery_names(Sample const &sample)
Return a set of all unique PqueryNames of the Pqueries of the given sample.
void filter_n_max_weight_placements(Pquery &pquery, size_t n)
Remove all PqueryPlacements but the n most likely ones from the Pquery.
bool has_name(Pquery const &pquery, std::string const &name)
Return true iff the given Pquery contains a particular name.
size_t name_size() const
Return the number of PqueryNames stored in this Pquery.
Definition: pquery.cpp:273
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
void scale_all_branch_lengths(Sample &smp, double factor)
Scale all branch lengths of the Tree and the position of the PqueryPlacements by a given factor...
void filter_pqueries_differing_names(Sample &sample_1, Sample &sample_2)
Remove all Pqueries from the two Samples that have a name in common.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
iterator_names end_names()
Definition: pquery.cpp:207
Tree operator functions.
std::string name
Name for a Pquery.
Definition: name.hpp:104
PqueryName & add_name(std::string name="", double multiplicity=1.0)
Create a new PqueryName using the provided parameters, add it to the Pquery and return it...
Definition: pquery.cpp:252
void scale_all_branch_lengths(Tree &tree, double factor)
Scale all branch lengths of a Tree by a given factor.
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:185
void remove_placement_at(size_t index)
Remove the PqueryPlacement at a certain index position within this Pquery.
Definition: pquery.cpp:185
Provides functions for working with Placements and Pqueries.
void normalize_weight_ratios(Pquery &pquery)
Recalculate the like_weight_ratio of the PqueryPlacement&s of a Pquery, so that their sum is 1...
One placement position of a Pquery on a Tree.
double sum(const Histogram &h)
PqueryPlacement & placement_at(size_t index)
Return the PqueryPlacement at a certain index.
Definition: pquery.cpp:164
iterator_placements end_placements()
Definition: pquery.cpp:62
void clear_names()
Delete all PqueryNames of this Pquery.
Definition: pquery.cpp:312
std::pair< PlacementTreeEdge const *, size_t > placement_count_max_edge(Sample const &smp)
Get the number of placements on the edge with the most placements, and a pointer to this edge...
Provides some valuable additions to STD.
void filter_pqueries_keeping_names(Sample &smp, std::string const &regex)
Remove all Pqueries which do not have at least one name that matches the given regex.
void collect_duplicate_pqueries(Sample &smp)
Find all Pqueries that share a common name and combine them into a single Pquery containing all thei...
utils::Range< iterator_placements > placements()
Return a Range iterator to the PqueryPlacements.
Definition: pquery.cpp:82
Default class containing the commonly needed data for tree edges.
std::vector< int > closest_leaf_distance_histogram(Sample const &smp, const double min, const double max, const int bins)
Returns a histogram counting the number of placements that have a certain distance to their closest l...
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
double total_multiplicity(Pquery const &pqry)
Return the sum of all multiplicities of the Pquery.
std::vector< int > closest_leaf_distance_histogram_auto(Sample const &smp, double &min, double &max, const int bins)
Returns the same type of histogram as closest_leaf_distance_histogram(), but automatically determines...
size_t placement_size() const
Return the number of PqueryPlacements stored in this Pquery.
Definition: pquery.cpp:154
std::pair< PlacementTreeEdge const *, double > placement_mass_max_edge(Sample const &smp)
Get the summed mass of the placements on the heaviest edge, measured by their like_weight_ratio, and a pointer to this edge.
void copy_pqueries(Sample const &source, Sample &target)
Copy all Pqueries from the source Sample (left parameter) to the target Sample (right parameter)...
void merge_duplicate_names(Pquery &pquery)
Merge all PqueryNames that have the same name property into one, while adding up their multiplicity...
iterator_names begin_names()
Definition: pquery.cpp:202
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
void filter_min_accumulated_weight(Pquery &pquery, double threshold)
Remove the PqueryPlacements with the lowest like_weight_ratio, while keeping the accumulated weight (...
void filter_min_weight_threshold(Pquery &pquery, double threshold)
Remove all PqueryPlacements that have a like_weight_ratio below the given threshold.
PqueryPlacement & add_placement(PlacementTreeEdge &edge)
Create a new PqueryPlacement at a given PlacementTreeEdge, add it to the Pquery and return it...
Definition: pquery.cpp:113
std::vector< int > closest_leaf_depth_histogram(Sample const &smp)
Return a histogram representing how many placements have which depth with respect to their closest le...
double total_placement_mass_with_multiplicities(Sample const &smp)
Get the mass of all PqueryPlacements of the Sample, using the multiplicities as factors.
iterator_placements begin_placements()
Definition: pquery.cpp:57
Header of Default Tree distance methods.
void adjust_branch_lengths(Sample &sample, tree::Tree const &source)
Take the branch lengths of the source Tree and use them as the new branch lengths of the sample...
iterator_pqueries begin()
Return an iterator to the beginning of the Pqueries of this Sample.
Definition: sample.cpp:239
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
Default Tree functions.
double total_placement_mass(Sample const &smp)
Get the summed mass of all PqueryPlacements in all Pqueries of the given Sample, where mass is measu...
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
void merge_duplicate_placements(Pquery &pquery)
Merge all PqueryPlacements of a Pquery that are on the same TreeEdge into one averaged PqueryPlacemen...
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
void filter_pqueries_intersecting_names(Sample &sample_1, Sample &sample_2)
Remove all Pqueries from the two Samples except the ones that have names in common.
std::vector< std::pair< TreeNode const *, double > > closest_leaf_distance_vector(Tree const &tree)
Return a vector containing the closest leaf node for each node, using the branch_length as distance m...
void clear_placements()
Delete all PqueryPlacements of this Pquery.
Definition: pquery.cpp:193
utils::Range< iterator_names > names()
Return a Range iterator to the PqueryNames.
Definition: pquery.cpp:227
Header of Tree distance methods.
std::unordered_map< int, PlacementTreeEdge * > edge_num_to_edge_map(PlacementTree const &tree)
Return a mapping of edge_num integers to the corresponding PlacementTreeEdge object.
size_t remove_empty_pqueries(Sample &sample)
Remove all Pqueries from the Sample that have no PqueryPlacements.
void remove(size_t index)
Remove the Pquery at a given index from the Sample.
Definition: sample.cpp:199
double like_weight_ratio
Likelihood weight ratio of this placement.
EdgeDataType & data()
Definition: edge.hpp:118
Pquery const * find_pquery(Sample const &smp, std::string const &name)
Return the first Pquery that has a particular name, or nullptr of none has.
size_t total_placement_count(Sample const &smp)
Get the total number of PqueryPlacements in all Pqueries of the given Sample.
void merge_duplicates(Sample &smp)
Look for Pqueries with the same name and merge them.
iterator_pqueries end()
Return an iterator to the end of the Pqueries of this Sample.
Definition: sample.cpp:249
Pquery & add()
Create an empty Pquery, add it to the Sample and return it.
Definition: sample.cpp:147
void filter_pqueries_removing_names(Sample &smp, std::string const &regex)
Remove all Pqueries which have at least one name that matches the given regex.