A library for working with phylogenetic and population genetic data.
v0.32.0
distributions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <numeric>
41 #include <stdexcept>
42 
43 namespace genesis {
44 namespace placement {
45 
46 // =================================================================================================
47 // Placement Number Distribution
48 // =================================================================================================
49 
57 {
58  // Make sure to never produce a placement on the central edge. We only want to produce
59  // additional placements, as we assume that the simulator already placed on the central edge.
60  if( placement_path_length_weights.size() > 0 ) {
62  }
63 
64  // Init the distribs.
65  placement_number_distrib_ = std::discrete_distribution<size_t>(
68  );
69  placement_path_length_distrib_ = std::discrete_distribution<size_t>(
72  );
73 
74  // If we are never ever creating additional placements (so, either we have 0 or 1 weights for
75  // the distribution), we can skip the part with edge candidates.
76  // Those are only used when creating additional placements.
77  if( placement_number_weights.size() < 2 ) {
78  return;
79  }
80 
81  // Init a matrix with the path lengths from all edges to each other
82  // (that is, the number of nodes between them).
83  auto edge_dist_matrix = tree::edge_path_length_matrix( sample.tree() );
84 
85  // For each edge, create a list of other edges in its proximity, sorted by their
86  // distance level (number of nodes in between) from that edge.
87  edge_proximities_.clear();
88  edge_proximities_.resize( edge_dist_matrix.rows() );
89  for( size_t edge_idx = 0; edge_idx < edge_dist_matrix.rows(); ++edge_idx ) {
90  for( size_t prox_idx = 0; prox_idx < edge_dist_matrix.cols(); ++prox_idx ) {
91  size_t level = edge_dist_matrix( edge_idx, prox_idx );
92 
93  // We do not want to add the edge itself. This would mean to add more than one
94  // placement on that edge.
95  if( level == 0 ) {
96  assert( edge_idx == prox_idx );
97  continue;
98  }
99 
100  // This list will contain all other edges of the tree in the end. We want to
101  // reduce this to only a certain level, depending on the maximal length of a path that
102  // we can ever generate according to the given distribution.
103  if( level >= placement_path_length_weights.size() ) {
104  continue;
105  }
106 
107  if( edge_proximities_[ edge_idx ].candidates_per_level.size() < level + 1 ) {
108  edge_proximities_[ edge_idx ].candidates_per_level.resize( level + 1 );
109  }
110  edge_proximities_[ edge_idx ].candidates_per_level[ level ].push_back( prox_idx );
111  edge_proximities_[ edge_idx ].total_candidates += 1;
112  }
113  }
114 }
115 
120 {
121  // Draw a number of placements and build a result vector of that size.
122  size_t placement_num = placement_number_distrib_( utils::Options::get().random_engine() );
123  auto result = std::vector<size_t>( placement_num );
124 
125  // If we are not creating any additional placements, we can skip the next steps.
126  if( placement_num == 0 ) {
127  return result;
128  }
129 
130  // We make sure that an edge gets at most one placement per pquery by maintaining a list of
131  // possible candidate edges that do not have a placement (for this pquery) yet.
132  // For this, get a list of all possible candidates of neighbouring edges of the given edge.
133  // We shuffle them so that we take different edges for every pquery.
134  auto edge_prox = edge_proximities_[ edge.index() ];
135  for( auto& candidates : edge_prox.candidates_per_level ) {
136  std::random_shuffle( candidates.begin() , candidates.end() );
137  }
138 
139  // We can only place as many placements as there are neighbouring edges.
140  // Only happens in very small trees, but we need this to avoid an endless loop later.
141  if( placement_num > edge_prox.total_candidates ) {
142  placement_num = edge_prox.total_candidates;
143  }
144 
145  // Now create as many more placement positions as needed.
146  for( size_t i = 0; i < placement_num; ++i ) {
147 
148  // Loop until we found an edge where this placement can go.
149  bool placed = false;
150  do {
151  // Draw randomly a value used to determine the distance of this placement from the
152  // central one. As we set the weight for path length 0 to 0.0, there should never
153  // be a path of 0 length, so assert it.
154  size_t path_len = placement_path_length_distrib_(
155  utils::Options::get().random_engine()
156  );
157  assert( path_len > 0 );
158 
159  // If we drew a path for which all edges of that distance are already used, we cannot
160  // use it, so skip it.
161  if( edge_prox.candidates_per_level[ path_len ].size() == 0 ) {
162  continue;
163  }
164 
165  auto place_edge_num = edge_prox.candidates_per_level[ path_len ].back();
166  edge_prox.candidates_per_level[ path_len ].pop_back();
167 
168  result[i] = place_edge_num;
169  placed = true;
170  } while( ! placed );
171  }
172 
173  return result;
174 }
175 
180 {
181  std::string result;
182 
183  for( size_t edge_idx = 0; edge_idx < edge_proximities_.size(); ++edge_idx ) {
184  auto& prox = edge_proximities_[ edge_idx ];
185  result += "Edge at index " + std::to_string( edge_idx ) + ":\n";
186 
187  for( size_t cand_idx = 0; cand_idx < prox.candidates_per_level.size(); ++cand_idx ) {
188  auto& cand = prox.candidates_per_level[ cand_idx ];
189 
190  result += " Level " + std::to_string( cand_idx ) + ": ";
191  result += std::to_string( cand.size() ) + " candidates\n";
192  }
193  }
194 
195  return result;
196 }
197 
202 {
203  auto result = std::vector<size_t>();
204 
205  for( auto const& prox : edge_proximities_ ) {
206  for( size_t cand_idx = 0; cand_idx < prox.candidates_per_level.size(); ++cand_idx ) {
207  auto& cand = prox.candidates_per_level[ cand_idx ];
208 
209  if( result.size() < cand_idx + 1 ) {
210  result.resize( cand_idx + 1 );
211  }
212  result[ cand_idx ] = std::max( result[ cand_idx ], cand.size() );
213  }
214  }
215 
216  return result;
217 }
218 
219 // =================================================================================================
220 // Like Weight Ratio Distribution
221 // =================================================================================================
222 
224 {
225  (void) sample;
226 
227  // Check condition.
228  if( intervals.size() != weights.size() ) {
229  throw std::logic_error(
230  "The number of intervals and weights has to identical for "
231  "SimulatorLikeWeightRatioDistribution."
232  );
233  }
234  if( ! std::is_sorted( intervals.begin(), intervals.end() )) {
235  throw std::logic_error(
236  "Intervals need to be sorted in SimulatorLikeWeightRatioDistribution."
237  );
238  }
239  if( std::any_of( weights.begin(), weights.end(), [] ( double v ) { return v < 0.0; } )) {
240  throw std::logic_error(
241  "Weights need to be non-negative in SimulatorLikeWeightRatioDistribution."
242  );
243  }
244 
245  // Set distribution.
246  distrib_ = std::piecewise_linear_distribution<double>(
247  intervals.begin(),
248  intervals.end(),
249  weights.begin()
250  );
251 }
252 
253 } // namespace placement
254 } // namespace genesis
genesis::placement::Sample::tree
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:124
genesis::placement::Sample
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on.
Definition: sample.hpp:68
genesis::tree::TreeEdge::index
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:106
genesis::placement::SimulatorExtraPlacementDistribution::dump_edge_proximities
std::string dump_edge_proximities() const
Definition: distributions.cpp:179
std.hpp
Provides some valuable additions to STD.
genesis::tree::edge_path_length_matrix
utils::Matrix< size_t > edge_path_length_matrix(Tree const &tree)
Definition: tree/function/distances.cpp:144
distributions.hpp
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::placement::SimulatorLikeWeightRatioDistribution::prepare
void prepare(Sample const &sample)
Prepare the distribution for usage. Needs to be called before generate().
Definition: distributions.cpp:223
genesis::placement::SimulatorExtraPlacementDistribution::prepare
void prepare(Sample const &sample)
Prepare the distribution for usage. Needs to be called before generate().
Definition: distributions.cpp:56
genesis::placement::SimulatorLikeWeightRatioDistribution::intervals
std::vector< double > intervals
Definition: distributions.hpp:225
matrix.hpp
genesis::tree::TreeEdge
Definition: edge.hpp:60
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::placement::SimulatorLikeWeightRatioDistribution::weights
std::vector< double > weights
Definition: distributions.hpp:226
options.hpp
genesis::placement::SimulatorExtraPlacementDistribution::generate
std::vector< size_t > generate(PlacementTreeEdge const &edge)
Definition: distributions.cpp:119
genesis::utils::Options::get
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:68
distances.hpp
Header of Tree distance methods.
genesis::placement::SimulatorExtraPlacementDistribution::placement_path_length_weights
std::vector< double > placement_path_length_weights
Definition: distributions.hpp:171
genesis::placement::SimulatorExtraPlacementDistribution::placement_number_weights
std::vector< double > placement_number_weights
Definition: distributions.hpp:170
genesis::placement::SimulatorExtraPlacementDistribution::edge_proximity_maxima
std::vector< size_t > edge_proximity_maxima() const
Definition: distributions.cpp:201