A library for working with phylogenetic and population genetic data.
v0.32.0
placement/simulator/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
38 
41 
45 
46 #include <cassert>
47 #include <cmath>
48 #include <ostream>
49 #include <stdexcept>
50 #include <vector>
51 
52 namespace genesis {
53 namespace placement {
54 
55 // =================================================================================================
56 // Stream Output Operators
57 // =================================================================================================
58 
59 std::ostream& operator << ( std::ostream& out, SimulatorEdgeDistribution const& distrib )
60 {
61  out << "Weight of each edge: ";
62  out << utils::join( distrib.edge_weights, ", ") << "\n";
63  return out;
64 }
65 
66 std::ostream& operator << ( std::ostream& out, SimulatorExtraPlacementDistribution const& distrib )
67 {
68  out << "Extra placement weights:\n";
69  for( size_t i = 0; i < distrib.placement_number_weights.size(); ++i ) {
70  out << i << ": " << distrib.placement_number_weights[i] << "\n";
71  }
72  out << "Path length weights:\n";
73  for( size_t i = 0; i < distrib.placement_path_length_weights.size(); ++i ) {
74  out << i << ": " << distrib.placement_path_length_weights[i] << "\n";
75  }
76  return out;
77 }
78 
79 std::ostream& operator << ( std::ostream& out, SimulatorLikeWeightRatioDistribution const& distrib )
80 {
81  if( distrib.intervals.size() != distrib.weights.size() ) {
82  throw std::logic_error( "Invalid SimulatorLikeWeightRatioDistribution." );
83  }
84 
85  out << "Like weight ratio intervals and weights:\n";
86  for( size_t i = 0; i < distrib.intervals.size(); ++i ) {
87  out << distrib.intervals[i] << ": " << distrib.weights[i] << "\n";
88  }
89  return out;
90 }
91 
92 // =================================================================================================
93 // Edge Distribution : Set Weights
94 // =================================================================================================
95 
96 // -----------------------------------------------------
97 // set_uniform_weights
98 // -----------------------------------------------------
99 
106 void set_uniform_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
107 {
108  set_uniform_weights( sample.tree().edge_count(), edge_distrib );
109 }
110 
116 {
117  edge_distrib.edge_weights = std::vector<double>( edge_count, 1.0 );
118 }
119 
120 // -----------------------------------------------------
121 // set_random_weights
122 // -----------------------------------------------------
123 
130 void set_random_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
131 {
132  set_random_weights( sample.tree().edge_count(), edge_distrib );
133 }
134 
140 {
141  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
142 
143  std::uniform_real_distribution<double> distrib( 0.0, 1.0 );
144  for( size_t i = 0; i < edge_count; ++i ) {
145  edge_distrib.edge_weights[i] = distrib( utils::Options::get().random_engine() );
146  }
147 }
148 
149 // -----------------------------------------------------
150 // set_random_edges
151 // -----------------------------------------------------
152 
159 void set_random_edges( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
160 {
161  set_random_edges( sample.tree().edge_count(), edge_distrib );
162 }
163 
169 {
170  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
171 
172  std::bernoulli_distribution distrib;
173  for (size_t i = 0; i < edge_count; ++i) {
174  if( distrib( utils::Options::get().random_engine() )) {
175  edge_distrib.edge_weights[i] = 1.0;
176  }
177  }
178 }
179 
180 // -----------------------------------------------------
181 // set_depths_distributed_weights
182 // -----------------------------------------------------
183 
196 {
197  auto depth_weights = closest_leaf_weight_distribution( sample );
198  set_depths_distributed_weights( sample, depth_weights, edge_distrib );
199 }
200 
219  Sample const& sample,
220  std::vector<double> const& depth_weights,
221  SimulatorEdgeDistribution& edge_distrib
222 ) {
223  // TODO Some of the code is copied from Sample::closest_leaf_weight_distribution(). Maybe
224  // it is worth putting this into a method which returns the closest leaf for edges instead
225  // of nodes.
226 
227  // Prepare weights vector.
228  size_t num_edges = sample.tree().edge_count();
229  edge_distrib.edge_weights = std::vector<double>(num_edges, 0.0);
230 
231  // Get a vector telling us the depth from each node to its closest leaf node.
232  auto depths = closest_leaf_depth_vector(sample.tree());
233 
234  // Set the weight of each edge according to its depth in the tree.
235  for( auto const& edge : sample.tree().edges() ) {
236  // Try both nodes at the end of the edge and see which one is closer to a leaf.
237  auto dp = depths[ edge.primary_node().index() ].second;
238  auto ds = depths[ edge.secondary_node().index() ].second;
239  auto ld = std::min(dp, ds);
240 
241  // Some safty. This holds as long as the indices are correct.
242  assert( edge.index() < num_edges );
243 
244  // If the depth of the current edge is in the depth vector, use it.
245  // Otherwise, the tree is deeper than the given depth vector, so use zero instead,
246  // which will result in no placements being generated on this edge.
247  if (ld < depth_weights.size()) {
248  edge_distrib.edge_weights[ edge.index() ] = depth_weights[ld];
249  } else {
250  edge_distrib.edge_weights[ edge.index() ] = 0.0;
251  }
252  }
253 }
254 
255 // -----------------------------------------------------
256 // set_random_subtree_weights
257 // -----------------------------------------------------
258 
265 size_t set_random_subtree_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
266 {
267  if( sample.tree().empty() ) {
268  throw std::runtime_error("Cannot set weights using an empty Tree.");
269  }
270 
271  // Reset all edge weights.
272  size_t edge_count = sample.tree().edge_count();
273  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
274 
275  assert( edge_count > 0 );
276  std::uniform_int_distribution<size_t> edge_selection( 0, edge_count - 1 );
277  size_t edge_idx = edge_selection( utils::Options::get().random_engine() );
278 
279  // Randomly select a subtree, either starting at the end of the edge in direction of the root,
280  // or away from it.
281  PlacementTreeLink const* start_link;
282  std::bernoulli_distribution dir_distrib;
283  if( dir_distrib( utils::Options::get().random_engine() )) {
284  // Primary direction
285  start_link = &sample.tree().edge_at(edge_idx).primary_link();
286  } else {
287  // Secondary direction
288  start_link = &sample.tree().edge_at(edge_idx).secondary_link();
289  }
290 
291  for (
292  auto cur_link = &start_link->next();
293  cur_link != start_link;
294  cur_link = &cur_link->outer().next()
295  ) {
296  edge_distrib.edge_weights[ cur_link->edge().index() ] = 1.0;
297  }
298 
299  return edge_idx;
300 }
301 
302 // -----------------------------------------------------
303 // set_subtree_weights
304 // -----------------------------------------------------
305 
313  Sample const& sample,
314  size_t link_index,
315  SimulatorEdgeDistribution& edge_distrib
316 ) {
317  // Validity checks.
318  if( link_index >= sample.tree().link_count() ) {
319  throw std::runtime_error( "Invalid link index for subtree." );
320  }
321  if( is_leaf( sample.tree().link_at( link_index ) )) {
322  throw std::runtime_error( "Cannot use a leaf node as subtree." );
323  }
324 
325  // Reset all edge weights.
326  auto edge_count = sample.tree().edge_count();
327  edge_distrib.edge_weights = std::vector<double>( edge_count, 0.0 );
328 
329  // Iterate the subtree and set edge weights.
330  PlacementTreeLink const* start_link = &sample.tree().link_at( link_index );
331  for (
332  auto cur_link = &start_link->next();
333  cur_link != start_link;
334  cur_link = &cur_link->outer().next()
335  ) {
336  edge_distrib.edge_weights[ cur_link->edge().index() ] = 1.0;
337  }
338 }
339 
340 // -----------------------------------------------------
341 // learn_per_edge_weights
342 // -----------------------------------------------------
343 
355 void learn_per_edge_weights( Sample const& sample, SimulatorEdgeDistribution& edge_distrib )
356 {
358 }
359 
360 // =================================================================================================
361 // Placement Number Distribution
362 // =================================================================================================
363 
365  Sample const& sample,
367 ) {
368  auto weights = std::vector<double>();
369  for( auto const& pquery : sample.pqueries() ) {
370  size_t extra_placements = pquery.placement_size() - 1;
371  if( weights.size() < extra_placements + 1 ) {
372  weights.resize( extra_placements + 1 );
373  }
374  ++weights[ extra_placements ];
375  }
376  p_distib.placement_number_weights = weights;
377 }
378 
380  Sample const& sample,
382 ) {
383  // The distance (path length) between two placements is the number of nodes between them.
384  // Get a matrix that gives us this number for each pair of edges of the tree.
385  auto edge_dist_matrix = tree::edge_path_length_matrix( sample.tree() );
386 
387  // Iterate all pqueries and collect the distances between all of their placements, where
388  // the distance is the number of nodes between them.
389  auto weights = std::vector<double>();
390  for( auto const& pquery : sample.pqueries() ) {
391  for( auto const& place_from : pquery.placements() ) {
392  for( auto const& place_to : pquery.placements() ) {
393  size_t dist = edge_dist_matrix(
394  place_from.edge().index(), place_to.edge().index()
395  );
396 
397  // The following assertion holds as long as our dist matrix is correct.
398  // If assertions are disabled (release), the whole condition should be eliminated
399  // from the binary, so there is no speed penalty.
400  if( place_from.edge().index() == place_to.edge().index() ) {
401  assert( dist == 0 );
402  }
403 
404  // We don't need paths of length 0 currently. They will be eliminated in the
405  // distrubution class anyway, when calling prepare().
406  if( dist == 0 ) {
407  continue;
408  }
409  if( weights.size() < dist + 1 ) {
410  weights.resize( dist + 1 );
411  }
412  weights[ dist ] += 1.0;
413  }
414  }
415  }
416  p_distib.placement_path_length_weights = weights;
417 }
418 
419 // =================================================================================================
420 // Like Weight Ratio Distribution
421 // =================================================================================================
422 
424  Sample const& sample,
426  size_t number_of_intervals
427 ) {
428  // Init the result vectors (we need one more element than number of intervals).
429  auto intervals = std::vector<double>( number_of_intervals + 1, 0.0 );
430  auto weights = std::vector<double>( number_of_intervals + 1, 0.0 );
431  for( size_t i = 0; i < number_of_intervals + 1; ++i ) {
432  intervals[i] = static_cast<double>( i ) / static_cast<double>( number_of_intervals );
433  }
434 
435  // Iterate all placements and use their rounded lwr value to increase the weights of the
436  // distribution.
437  for( auto const& pquery : sample.pqueries() ) {
438  for( auto const& placement : pquery.placements() ) {
439 
440  double rounded_lwr = std::round( placement.like_weight_ratio * number_of_intervals )
441  / number_of_intervals;
442 
443  if( 0.0 > rounded_lwr || rounded_lwr > 1.0 ) {
444  throw std::runtime_error( "Invalid like_weight_ratio in Sample." );
445  }
446  size_t pos = static_cast<size_t>( rounded_lwr * number_of_intervals );
447  assert( pos < weights.size() );
448  weights[ pos ] += 1.0;
449  }
450  }
451 
452  // Set the result.
453  lwr_distib.intervals = intervals;
454  lwr_distib.weights = weights;
455 }
456 
457 } // namespace placement
458 } // namespace genesis
genesis::placement::SimulatorLikeWeightRatioDistribution
Definition: distributions.hpp:185
genesis::tree::TreeEdge::primary_link
TreeLink & primary_link()
Return the TreeLink of this TreeEdge that points towards the root.
Definition: edge.hpp:114
genesis::placement::Sample::tree
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:124
genesis::placement::SimulatorEdgeDistribution::edge_weights
std::vector< double > edge_weights
Definition: distributions.hpp:105
genesis::tree::Tree::link_count
size_t link_count() const
Return the number of TreeLinks of the Tree.
Definition: tree/tree.hpp:256
genesis::placement::set_random_edges
void set_random_edges(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Set the weights of a SimulatorEdgeDistribution randomly to either 0.0 or 1.0, so that a random subset...
Definition: placement/simulator/functions.cpp:159
genesis::placement::Sample
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on.
Definition: sample.hpp:68
functions.hpp
genesis::tree::Tree::edge_at
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
std.hpp
Provides some valuable additions to STD.
genesis::placement::learn_per_edge_weights
void learn_per_edge_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Sets the weights of an SimulatorEdgeDistributionso that they follow the same distribution of placemen...
Definition: placement/simulator/functions.cpp:355
functions.hpp
Provides functions for working with Placements and Pqueries.
genesis::placement::SimulatorExtraPlacementDistribution
Generate a certain number of additional PqueryPlacements around a given PlacementTreeEdge.
Definition: distributions.hpp:126
genesis::tree::edge_path_length_matrix
utils::Matrix< size_t > edge_path_length_matrix(Tree const &tree)
Definition: tree/function/distances.cpp:144
distributions.hpp
genesis::placement::set_random_subtree_weights
size_t set_random_subtree_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Sets the weights of an SimulatorEdgeDistribution to 1.0 for a randomly chosen subtree,...
Definition: placement/simulator/functions.cpp:265
genesis::placement::SimulatorEdgeDistribution
Definition: distributions.hpp:49
string.hpp
Provides some commonly used string utility functions.
genesis::placement::set_uniform_weights
void set_uniform_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Sets the weights of an SimulatorEdgeDistribution to 1.0 for all edges, so that each edge has the same...
Definition: placement/simulator/functions.cpp:106
genesis::tree::Tree::empty
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
masses.hpp
genesis::placement::SimulatorLikeWeightRatioDistribution::intervals
std::vector< double > intervals
Definition: distributions.hpp:225
matrix.hpp
genesis::utils::join
Interval< DataType, NumericalType, IntervalKind > join(Interval< DataType, NumericalType, IntervalKind > const &a, Interval< DataType, NumericalType, IntervalKind > const &b)
Creates a new Interval that contains both intervals and whatever is between.
Definition: utils/containers/interval_tree/functions.hpp:127
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::placement::learn_like_weight_ratio_distribution
void learn_like_weight_ratio_distribution(Sample const &sample, SimulatorLikeWeightRatioDistribution &lwr_distib, size_t number_of_intervals)
Definition: placement/simulator/functions.cpp:423
genesis::placement::SimulatorLikeWeightRatioDistribution::weights
std::vector< double > weights
Definition: distributions.hpp:226
genesis::tree::Tree::edges
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
genesis::placement::set_subtree_weights
void set_subtree_weights(Sample const &sample, size_t link_index, SimulatorEdgeDistribution &edge_distrib)
Set the weights of a subtree to 1.0 and all other weights to 0.0.
Definition: placement/simulator/functions.cpp:312
genesis::placement::learn_placement_path_length_weights
void learn_placement_path_length_weights(Sample const &sample, SimulatorExtraPlacementDistribution &p_distib)
Definition: placement/simulator/functions.cpp:379
functions.hpp
genesis::placement::closest_leaf_weight_distribution
std::vector< double > closest_leaf_weight_distribution(Sample const &sample)
Definition: placement/function/functions.cpp:830
genesis::placement::set_depths_distributed_weights
void set_depths_distributed_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Set the weights of an SimulatorEdgeDistribution so that they follow the depth distribution of the edg...
Definition: placement/simulator/functions.cpp:195
genesis::placement::placement_mass_per_edges_with_multiplicities
std::vector< double > placement_mass_per_edges_with_multiplicities(Sample const &sample)
Return a vector that contains the sum of the masses of the PqueryPlacements per edge of the tree of t...
Definition: masses.cpp:87
genesis::tree::Tree::link_at
TreeLink & link_at(size_t index)
Return the TreeLink at a certain index.
Definition: tree/tree.hpp:202
genesis::placement::set_random_weights
void set_random_weights(Sample const &sample, SimulatorEdgeDistribution &edge_distrib)
Set the weights of an SimulatorEdgeDistribution for the edges randomly to a value between 0....
Definition: placement/simulator/functions.cpp:130
genesis::utils::Options::get
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:68
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
distances.hpp
Header of Tree distance methods.
genesis::placement::SimulatorExtraPlacementDistribution::placement_path_length_weights
std::vector< double > placement_path_length_weights
Definition: distributions.hpp:171
genesis::placement::Sample::pqueries
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:264
genesis::tree::TreeEdge::secondary_link
TreeLink & secondary_link()
Return the TreeLink of this TreeEdge that points away from the root.
Definition: edge.hpp:130
genesis::placement::operator<<
std::ostream & operator<<(std::ostream &out, Sample const &smp)
Print a table of all Pqueries with their Placements and Names to the stream.
Definition: placement/function/operators.cpp:238
genesis::placement::learn_placement_number_weights
void learn_placement_number_weights(Sample const &sample, SimulatorExtraPlacementDistribution &p_distib)
Definition: placement/simulator/functions.cpp:364
genesis::tree::edge_count
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
Definition: tree/function/functions.cpp:212
genesis::tree::closest_leaf_depth_vector
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
Definition: tree/function/distances.cpp:251
helper.hpp
genesis::placement::SimulatorExtraPlacementDistribution::placement_number_weights
std::vector< double > placement_number_weights
Definition: distributions.hpp:170
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272