A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
placement/simulator/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
38 
41 
45 
46 #include <assert.h>
47 #include <cmath>
48 #include <ostream>
49 #include <stdexcept>
50 #include <vector>
51 
52 namespace genesis {
53 namespace placement {
54 
55 // =================================================================================================
56 // Stream Output Operators
57 // =================================================================================================
58 
59 std::ostream& operator << ( std::ostream& out, SimulatorEdgeDistribution const& distrib )
60 {
61  out << "Weight of each edge: ";
62  out << utils::join( distrib.edge_weights, ", ") << "\n";
63  return out;
64 }
65 
66 std::ostream& operator << ( std::ostream& out, SimulatorExtraPlacementDistribution const& distrib )
67 {
68  out << "Extra placement weights:\n";
69  for( size_t i = 0; i < distrib.placement_number_weights.size(); ++i ) {
70  out << i << ": " << distrib.placement_number_weights[i] << "\n";
71  }
72  out << "Path length weights:\n";
73  for( size_t i = 0; i < distrib.placement_path_length_weights.size(); ++i ) {
74  out << i << ": " << distrib.placement_path_length_weights[i] << "\n";
75  }
76  return out;
77 }
78 
79 std::ostream& operator << ( std::ostream& out, SimulatorLikeWeightRatioDistribution const& distrib )
80 {
81  if( distrib.intervals.size() != distrib.weights.size() ) {
82  throw std::logic_error( "Invalid SimulatorLikeWeightRatioDistribution." );
83  }
84 
85  out << "Like weight ratio intervals and weights:\n";
86  for( size_t i = 0; i < distrib.intervals.size(); ++i ) {
87  out << distrib.intervals[i] << ": " << distrib.weights[i] << "\n";
88  }
89  return out;
90 }
91 
92 // =================================================================================================
93 // Edge Distribution : Set Weights
94 // =================================================================================================
95 
96 // -----------------------------------------------------
97 // set_uniform_weights
98 // -----------------------------------------------------
99 
106 void set_uniform_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
107 {
108  set_uniform_weights( sample.tree().edge_count(), edge_distrib );
109 }
110 
116 {
117  edge_distrib.edge_weights = std::vector<double>( edge_count, 1.0 );
118 }
119 
120 // -----------------------------------------------------
121 // set_random_weights
122 // -----------------------------------------------------
123 
130 void set_random_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
131 {
132  set_random_weights( sample.tree().edge_count(), edge_distrib );
133 }
134 
140 {
141  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
142 
143  std::uniform_real_distribution<double> distrib( 0.0, 1.0 );
144  for( size_t i = 0; i < edge_count; ++i ) {
145  edge_distrib.edge_weights[i] = distrib( utils::Options::get().random_engine() );
146  }
147 }
148 
149 // -----------------------------------------------------
150 // set_random_edges
151 // -----------------------------------------------------
152 
159 void set_random_edges( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
160 {
161  set_random_edges( sample.tree().edge_count(), edge_distrib );
162 }
163 
169 {
170  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
171 
172  std::bernoulli_distribution distrib;
173  for (size_t i = 0; i < edge_count; ++i) {
174  if( distrib( utils::Options::get().random_engine() )) {
175  edge_distrib.edge_weights[i] = 1.0;
176  }
177  }
178 }
179 
180 // -----------------------------------------------------
181 // set_depths_distributed_weights
182 // -----------------------------------------------------
183 
196 {
197  auto depth_weights = closest_leaf_weight_distribution( sample );
198  set_depths_distributed_weights( sample, depth_weights, edge_distrib );
199 }
200 
219  Sample const& sample,
220  std::vector<double> const& depth_weights,
221  SimulatorEdgeDistribution& edge_distrib
222 ) {
223  // TODO Some of the code is copied from Sample::closest_leaf_weight_distribution(). Maybe
224  // it is worth putting this into a method which returns the closest leaf for edges instead
225  // of nodes.
226 
227  // Prepare weights vector.
228  size_t num_edges = sample.tree().edge_count();
229  edge_distrib.edge_weights = std::vector<double>(num_edges, 0.0);
230 
231  // Get a vector telling us the depth from each node to its closest leaf node.
232  auto depths = closest_leaf_depth_vector(sample.tree());
233 
234  // Set the weight of each edge according to its depth in the tree.
235  for (auto it = sample.tree().begin_edges(); it != sample.tree().end_edges(); ++it) {
236  // Try both nodes at the end of the edge and see which one is closer to a leaf.
237  int dp = depths[(*it)->primary_node().index()].second;
238  int ds = depths[(*it)->secondary_node().index()].second;
239  unsigned int ld = std::min(dp, ds);
240 
241  // Some safty. This holds as long as the indices are correct.
242  assert((*it)->index() < num_edges);
243 
244  // If the depth of the current edge is in the depth vector, use it.
245  // Otherwise, the tree is deeper than the given depth vector, so use zero instead,
246  // which will result in no placements being generated on this edge.
247  if (ld < depth_weights.size()) {
248  edge_distrib.edge_weights[ (*it)->index() ] = depth_weights[ld];
249  } else {
250  edge_distrib.edge_weights[ (*it)->index() ] = 0.0;
251  }
252  }
253 }
254 
255 // -----------------------------------------------------
256 // set_random_subtree_weights
257 // -----------------------------------------------------
258 
265 size_t set_random_subtree_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
266 {
267  // Reset all edge weights.
268  size_t edge_count = sample.tree().edge_count();
269  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
270 
271  std::uniform_int_distribution<int> edge_selection( 0, edge_count - 1 );
272  size_t edge_idx = edge_selection( utils::Options::get().random_engine() );
273 
274  // Randomly select a subtree, either starting at the end of the edge in direction of the root,
275  // or away from it.
276  PlacementTreeLink const* start_link;
277  std::bernoulli_distribution dir_distrib;
278  if( dir_distrib( utils::Options::get().random_engine() )) {
279  // Primary direction
280  start_link = &sample.tree().edge_at(edge_idx).primary_link();
281  } else {
282  // Secondary direction
283  start_link = &sample.tree().edge_at(edge_idx).secondary_link();
284  }
285 
286  for (
287  auto cur_link = &start_link->next();
288  cur_link != start_link;
289  cur_link = &cur_link->outer().next()
290  ) {
291  edge_distrib.edge_weights[ cur_link->edge().index() ] = 1.0;
292  }
293 
294  return edge_idx;
295 }
296 
297 // -----------------------------------------------------
298 // set_subtree_weights
299 // -----------------------------------------------------
300 
308  Sample const& sample,
309  size_t link_index,
310  SimulatorEdgeDistribution& edge_distrib
311 ) {
312  // Validity checks.
313  if( link_index >= sample.tree().link_count() ) {
314  throw std::runtime_error( "Invalid link index for subtree." );
315  }
316  if( is_leaf( sample.tree().link_at( link_index ) )) {
317  throw std::runtime_error( "Cannot use a leaf node as subtree." );
318  }
319 
320  // Reset all edge weights.
321  auto edge_count = sample.tree().edge_count();
322  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
323 
324  // Iterate the subtree and set edge weights.
325  PlacementTreeLink const* start_link = &sample.tree().link_at( link_index );
326  for (
327  auto cur_link = &start_link->next();
328  cur_link != start_link;
329  cur_link = &cur_link->outer().next()
330  ) {
331  edge_distrib.edge_weights[ cur_link->edge().index() ] = 1.0;
332  }
333 }
334 
335 // -----------------------------------------------------
336 // learn_per_edge_weights
337 // -----------------------------------------------------
338 
350 void learn_per_edge_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
351 {
353 }
354 
355 // =================================================================================================
356 // Placement Number Distribution
357 // =================================================================================================
358 
360  Sample const& sample,
362 ) {
363  auto weights = std::vector<double>();
364  for( auto const& pquery : sample.pqueries() ) {
365  size_t extra_placements = pquery.placement_size() - 1;
366  if( weights.size() < extra_placements + 1 ) {
367  weights.resize( extra_placements + 1 );
368  }
369  ++weights[ extra_placements ];
370  }
371  p_distib.placement_number_weights = weights;
372 }
373 
375  Sample const& sample,
377 ) {
378  // The distance (path length) between two placements is the number of nodes between them.
379  // Get a matrix that gives us this number for each pair of edges of the tree.
380  auto edge_dist_matrix = tree::edge_path_length_matrix( sample.tree() );
381 
382  // Iterate all pqueries and collect the distances between all of their placements, where
383  // the distance is the number of nodes between them.
384  auto weights = std::vector<double>();
385  for( auto const& pquery : sample.pqueries() ) {
386  for( auto const& place_from : pquery.placements() ) {
387  for( auto const& place_to : pquery.placements() ) {
388  size_t dist = edge_dist_matrix(
389  place_from.edge().index(), place_to.edge().index()
390  );
391 
392  // The following assertion holds as long as our dist matrix is correct.
393  // If assertions are disabled (release), the whole condition should be eliminated
394  // from the binary, so there is no speed penalty.
395  if( place_from.edge().index() == place_to.edge().index() ) {
396  assert( dist == 0 );
397  }
398 
399  // We don't need paths of length 0 currently. They will be eliminated in the
400  // distrubution class anyway, when calling prepare().
401  if( dist == 0 ) {
402  continue;
403  }
404  if( weights.size() < dist + 1 ) {
405  weights.resize( dist + 1 );
406  }
407  weights[ dist ] += 1.0;
408  }
409  }
410  }
411  p_distib.placement_path_length_weights = weights;
412 }
413 
414 // =================================================================================================
415 // Like Weight Ratio Distribution
416 // =================================================================================================
417 
419  Sample const& sample,
421  size_t number_of_intervals
422 ) {
423  // Init the result vectors (we need one more element than number of intervals).
424  auto intervals = std::vector<double>( number_of_intervals + 1, 0.0 );
425  auto weights = std::vector<double>( number_of_intervals + 1, 0.0 );
426  for( size_t i = 0; i < number_of_intervals + 1; ++i ) {
427  intervals[i] = static_cast<double>( i ) / static_cast<double>( number_of_intervals );
428  }
429 
430  // Iterate all placements and use their rounded lwr value to increase the weights of the
431  // distribution.
432  for( auto const& pquery : sample.pqueries() ) {
433  for( auto const& placement : pquery.placements() ) {
434 
435  double rounded_lwr = std::round( placement.like_weight_ratio * number_of_intervals )
436  / number_of_intervals;
437 
438  if( 0.0 > rounded_lwr || rounded_lwr > 1.0 ) {
439  throw std::runtime_error( "Invalid like_weight_ratio in Sample." );
440  }
441  size_t pos = static_cast<size_t>( rounded_lwr * number_of_intervals );
442  assert( pos < weights.size() );
443  weights[ pos ] += 1.0;
444  }
445  }
446 
447  // Set the result.
448  lwr_distib.intervals = intervals;
449  lwr_distib.weights = weights;
450 }
451 
452 } // namespace placement
453 } // namespace genesis
void set_depths_distributed_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Set the weights of an SimulatorEdgeDistribution so that they follow the depth distribution of the edg...
IteratorEdges begin_edges()
Definition: tree/tree.cpp:498
void set_random_edges(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Set the weights of a SimulatorEdgeDistribution randomly to either 0.0 or 1.0, so that a random subset...
IteratorEdges end_edges()
Definition: tree/tree.cpp:503
TreeLink & secondary_link()
Return the TreeLink of this TreeEdge that points away from the root.
Definition: edge.hpp:129
std::vector< double > closest_leaf_weight_distribution(Sample const &sample)
size_t set_random_subtree_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Sets the weights of an SimulatorEdgeDistribution to 1.0 for a randomly chosen subtree, all others to 0.0.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
std::string join(T const &v, std::string const &delimiter)
Return a string where the elements of a container v are joined using the string delimiter in between ...
Definition: string.hpp:399
Provides functions for working with Placements and Pqueries.
void set_uniform_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Sets the weights of an SimulatorEdgeDistribution to 1.0 for all edges, so that each edge has the same...
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
Provides some valuable additions to STD.
TreeLink & link_at(size_t index)
Return the TreeLink at a certain index.
Definition: tree/tree.cpp:284
utils::Matrix< size_t > edge_path_length_matrix(Tree const &tree)
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
void learn_placement_path_length_weights(Sample const &sample, SimulatorExtraPlacementDistribution &p_distib)
void learn_like_weight_ratio_distribution(Sample const &sample, SimulatorLikeWeightRatioDistribution &lwr_distib, size_t number_of_intervals)
void set_random_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Set the weights of an SimulatorEdgeDistribution for the edges randomly to a value between 0...
void learn_per_edge_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Sets the weights of an SimulatorEdgeDistributionso that they follow the same distribution of placemen...
Provides some commonly used string utility functions.
void learn_placement_number_weights(Sample const &sample, SimulatorExtraPlacementDistribution &p_distib)
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
Generate a certain number of additional PqueryPlacements around a given PlacementTreeEdge.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
std::vector< double > placement_mass_per_edges_with_multiplicities(Sample const &sample)
Return a vector that contains the sum of the masses of the PqueryPlacements per edge of the tree of t...
Definition: masses.cpp:87
Header of Tree distance methods.
std::ostream & operator<<(std::ostream &out, Sample const &smp)
Print a table of all Pqueries with their Placements and Names to the stream.
TreeLink & primary_link()
Return the TreeLink of this TreeEdge that points towards the root.
Definition: edge.hpp:113
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:60
size_t link_count() const
Return the number of TreeLinks of the Tree.
Definition: tree/tree.cpp:342
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
void set_subtree_weights(Sample const &sample, size_t link_index, SimulatorEdgeDistribution &edge_distrib)
Set the weights of a subtree to 1.0 and all other weights to 0.0.